Scaleway NVIDIA H100 Performance evaluation - Ollama
Go back to list
Ollama is an open-source tool for running Large Language Models (LLMs) of different types and sizes. In this study we used the official Llama2 model in its 3 sizes:
- 7B (7 billion parameters): 8GB of VRAM
- 13B: 16GB
- 70B: 32GB
A higher number of parameters theoretically increases the model's efficiency, but at the cost of requiring more GPU memory.
As our evaluation isn't about the model's efficiency or its capacity to elaborate good answers, we focus on the system's ability to produce a response quickly, measured in tokens per second. A token is generally a word in a sentence.
Here is an example of a test invocation:
$ ollama run llama2 --verbose 'Write a 100 word paragraph about weather.'
{
"configuration": {
"chart": {
"type": "bar",
"polar": false,
"zoomType": "",
"options3d": {},
"height": 450,
"width": null,
"margin": null,
"inverted": false,
"zooming": {}
},
"credits": {
"enabled": false
},
"title": {
"text": "Llama2"
},
"colorAxis": null,
"subtitle": {
"text": ""
},
"xAxis": {
"title": {
"text": [
""
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": [
"flavor__gpu_model"
],
"lineWidth": 1,
"tickInterval": null,
"tickWidth": 0,
"tickLength": 10,
"tickPixelInterval": null,
"plotLines": null,
"labels": {
"enabled": false,
"format": null,
"formatter": "",
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false,
"step": 0
},
"plotBands": null,
"visible": true,
"floor": null,
"ceiling": null,
"type": "linear",
"min": null,
"gridLineWidth": null,
"gridLineColor": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"tickmarkPlacement": null,
"units": null,
"minRange": null
},
"yAxis": {
"title": {
"text": [
"Token/second<br>Higher is better"
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": null,
"plotLines": null,
"plotBands": null,
"lineWidth": null,
"tickInterval": null,
"tickLength": 10,
"floor": null,
"ceiling": null,
"gridLineInterpolation": null,
"gridLineWidth": 1,
"gridLineColor": "#CCC",
"min": null,
"max": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"minRange": null,
"type": "linear",
"tickmarkPlacement": null,
"labels": {
"enabled": true,
"formatter": null,
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false
}
},
"zAxis": {
"title": {
"text": "Token/second<br>Higher is better"
}
},
"plotOptions": {
"series": {
"dataLabels": {
"enabled": true,
"format": "{series.name}",
"distance": 30,
"align": "left",
"inside": true,
"allowOverlap": false,
"style": {
"fontSize": "17px"
}
},
"showInLegend": null,
"turboThreshold": 1000,
"stacking": "",
"groupPadding": 0,
"centerInCategory": false,
"findNearestPointBy": "x"
}
},
"navigator": {
"enabled": false
},
"scrollbar": {
"enabled": false
},
"rangeSelector": {
"enabled": false,
"inputEnabled": false
},
"legend": {
"enabled": true,
"maxHeight": null,
"align": "center",
"verticalAlign": "bottom",
"layout": "horizontal",
"width": null,
"margin": 12,
"reversed": false
},
"series": [
{
"name": "A100-SXM4-40GB",
"data": [
[
0,
108.43722222222222
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "A10G",
"data": [
[
1,
84.86333333333333
]
],
"grouping": false,
"color": "#f7981d"
},
{
"name": "L4",
"data": [
[
2,
50.90428571428571
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "NVIDIA A40",
"data": [
[
3,
94.9889880952381
]
],
"grouping": false,
"color": "#d20000"
},
{
"name": "NVIDIA H100 PCIe",
"data": [
[
4,
160.78670454545454
]
],
"grouping": false,
"color": "#510099"
},
{
"name": "Tesla T4",
"data": [
[
5,
47.31517241379311
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Tesla P100",
"data": [
[
6,
21.73
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Tesla P100-PCIE-16GB",
"data": [
[
7,
22.0405
]
],
"grouping": false,
"color": "#510099"
},
{
"name": "Tesla V100-PCIE-16GB",
"data": [
[
8,
110.15955555555556
]
],
"grouping": false,
"color": "#484848"
},
{
"name": "Tesla V100-SXM2-16GB",
"data": [
[
9,
117.11066666666667
]
],
"grouping": false,
"color": "#f7981d"
},
{
"name": "Tesla V100S-PCIE-32GB",
"data": [
[
10,
121.96701149425286
]
],
"grouping": false,
"color": "#484848"
}
],
"drilldown": {},
"tooltip": {
"enabled": true,
"useHTML": false,
"headerFormat": "",
"pointFormat": "<span style=\"color:{series.color}\">{series.name}</span>: <b>{point.y:.1f} token/sec</b>",
"footerFormat": "",
"shared": false,
"outside": false,
"valueDecimals": null,
"split": false
},
"annotations": null
},
"hc_type": "chart",
"id": "llama2"
}
{
"configuration": {
"chart": {
"type": "bar",
"polar": false,
"zoomType": "",
"options3d": {},
"height": 450,
"width": null,
"margin": null,
"inverted": false,
"zooming": {}
},
"credits": {
"enabled": false
},
"title": {
"text": "Llama2:13b"
},
"colorAxis": null,
"subtitle": {
"text": ""
},
"xAxis": {
"title": {
"text": [
""
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": [
"flavor__gpu_model"
],
"lineWidth": 1,
"tickInterval": null,
"tickWidth": 0,
"tickLength": 10,
"tickPixelInterval": null,
"plotLines": null,
"labels": {
"enabled": false,
"format": null,
"formatter": "",
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false,
"step": 0
},
"plotBands": null,
"visible": true,
"floor": null,
"ceiling": null,
"type": "linear",
"min": null,
"gridLineWidth": null,
"gridLineColor": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"tickmarkPlacement": null,
"units": null,
"minRange": null
},
"yAxis": {
"title": {
"text": [
"Token/second<br>Higher is better"
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": null,
"plotLines": null,
"plotBands": null,
"lineWidth": null,
"tickInterval": null,
"tickLength": 10,
"floor": null,
"ceiling": null,
"gridLineInterpolation": null,
"gridLineWidth": 1,
"gridLineColor": "#CCC",
"min": null,
"max": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"minRange": null,
"type": "linear",
"tickmarkPlacement": null,
"labels": {
"enabled": true,
"formatter": null,
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false
}
},
"zAxis": {
"title": {
"text": "Token/second<br>Higher is better"
}
},
"plotOptions": {
"series": {
"dataLabels": {
"enabled": true,
"format": "{series.name}",
"distance": 30,
"align": "left",
"inside": true,
"allowOverlap": false,
"style": {
"fontSize": "17px"
}
},
"showInLegend": null,
"turboThreshold": 1000,
"stacking": "",
"groupPadding": 0,
"centerInCategory": false,
"findNearestPointBy": "x"
}
},
"navigator": {
"enabled": false
},
"scrollbar": {
"enabled": false
},
"rangeSelector": {
"enabled": false,
"inputEnabled": false
},
"legend": {
"enabled": true,
"maxHeight": null,
"align": "center",
"verticalAlign": "bottom",
"layout": "horizontal",
"width": null,
"margin": 12,
"reversed": false
},
"series": [
{
"name": "A100-SXM4-40GB",
"data": [
[
0,
79.86888888888889
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "A10G",
"data": [
[
1,
52.69833333333333
]
],
"grouping": false,
"color": "#f7981d"
},
{
"name": "L4",
"data": [
[
2,
29.94962962962963
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "NVIDIA A40",
"data": [
[
3,
57.23049079754601
]
],
"grouping": false,
"color": "#d20000"
},
{
"name": "NVIDIA H100 PCIe",
"data": [
[
4,
108.09458823529413
]
],
"grouping": false,
"color": "#510099"
},
{
"name": "Tesla T4",
"data": [
[
5,
26.076999999999998
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Tesla P100",
"data": [
[
6,
12.213333333333333
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Tesla P100-PCIE-16GB",
"data": [
[
7,
12.511621621621622
]
],
"grouping": false,
"color": "#510099"
},
{
"name": "Tesla V100-PCIE-16GB",
"data": [
[
8,
67.29761904761905
]
],
"grouping": false,
"color": "#484848"
},
{
"name": "Tesla V100-SXM2-16GB",
"data": [
[
9,
70.5474074074074
]
],
"grouping": false,
"color": "#f7981d"
},
{
"name": "Tesla V100S-PCIE-32GB",
"data": [
[
10,
77.16588235294118
]
],
"grouping": false,
"color": "#484848"
}
],
"drilldown": {},
"tooltip": {
"enabled": true,
"useHTML": false,
"headerFormat": "",
"pointFormat": "<span style=\"color:{series.color}\">{series.name}</span>: <b>{point.y:.1f} token/sec</b>",
"footerFormat": "",
"shared": false,
"outside": false,
"valueDecimals": null,
"split": false
},
"annotations": null
},
"hc_type": "chart",
"id": "llama2:13b"
}
{
"configuration": {
"chart": {
"type": "bar",
"polar": false,
"zoomType": "",
"options3d": {},
"height": 450,
"width": null,
"margin": null,
"inverted": false,
"zooming": {}
},
"credits": {
"enabled": false
},
"title": {
"text": "Llama2:70b"
},
"colorAxis": null,
"subtitle": {
"text": ""
},
"xAxis": {
"title": {
"text": [
""
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": [
"flavor__gpu_model"
],
"lineWidth": 1,
"tickInterval": null,
"tickWidth": 0,
"tickLength": 10,
"tickPixelInterval": null,
"plotLines": null,
"labels": {
"enabled": false,
"format": null,
"formatter": "",
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false,
"step": 0
},
"plotBands": null,
"visible": true,
"floor": null,
"ceiling": null,
"type": "linear",
"min": null,
"gridLineWidth": null,
"gridLineColor": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"tickmarkPlacement": null,
"units": null,
"minRange": null
},
"yAxis": {
"title": {
"text": [
"Token/second<br>Higher is better"
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": null,
"plotLines": null,
"plotBands": null,
"lineWidth": null,
"tickInterval": null,
"tickLength": 10,
"floor": null,
"ceiling": null,
"gridLineInterpolation": null,
"gridLineWidth": 1,
"gridLineColor": "#CCC",
"min": null,
"max": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"minRange": null,
"type": "linear",
"tickmarkPlacement": null,
"labels": {
"enabled": true,
"formatter": null,
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false
}
},
"zAxis": {
"title": {
"text": "Token/second<br>Higher is better"
}
},
"plotOptions": {
"series": {
"dataLabels": {
"enabled": true,
"format": "{series.name}",
"distance": 30,
"align": "left",
"inside": true,
"allowOverlap": false,
"style": {
"fontSize": "17px"
}
},
"showInLegend": null,
"turboThreshold": 1000,
"stacking": "",
"groupPadding": 0,
"centerInCategory": false,
"findNearestPointBy": "x"
}
},
"navigator": {
"enabled": false
},
"scrollbar": {
"enabled": false
},
"rangeSelector": {
"enabled": false,
"inputEnabled": false
},
"legend": {
"enabled": true,
"maxHeight": null,
"align": "center",
"verticalAlign": "bottom",
"layout": "horizontal",
"width": null,
"margin": 12,
"reversed": false
},
"series": [
{
"name": "A100-SXM4-40GB",
"data": [
[
0,
1.7875
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "L4",
"data": [
[
1,
1.0558333333333334
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "NVIDIA H100 PCIe",
"data": [
[
2,
30.475540540540543
]
],
"grouping": false,
"color": "#510099"
},
{
"name": "Tesla P100",
"data": [
[
3,
0.4325
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Tesla P100-PCIE-16GB",
"data": [
[
4,
0.7361538461538462
]
],
"grouping": false,
"color": "#510099"
},
{
"name": "Tesla V100S-PCIE-32GB",
"data": [
[
5,
2.4988235294117644
]
],
"grouping": false,
"color": "#484848"
}
],
"drilldown": {},
"tooltip": {
"enabled": true,
"useHTML": false,
"headerFormat": "",
"pointFormat": "<span style=\"color:{series.color}\">{series.name}</span>: <b>{point.y:.1f} token/sec</b>",
"footerFormat": "",
"shared": false,
"outside": false,
"valueDecimals": null,
"split": false
},
"annotations": null
},
"hc_type": "chart",
"id": "llama2:70b"
}
Cloud Mercato's observations
- As the large model requires 32GB of VRAM, the panel of tested GPUs is very restricted in this category
- The H100 stands far ahead at about 30 tokens/sec, while every other GPU tested stays below 3 tokens/sec