Scaleway NVIDIA H100 Performance evaluation - Ollama
Go back to list
Ollama is an open-source tool allowing the usage of Large Language Models (LLMs) with different types and sizes of models. In this study we used the official Llama2 model in its 3 sizes:
- 7 billion parameters (7B): 8GB of VRAM
- 13B: 16GB
- 70B: 32GB
A higher number of parameters theoretically increases the model's efficiency, but at the cost of requiring more memory from GPUs.
As our evaluation isn't about the model's efficiency and its capacity to elaborate good answers, we focus on the ability of the system to quickly produce a response, measured in tokens per second. A token is generally a word in a sentence.
Here is an example of a test invocation:
$ ollama run llama2 --verbose 'Write a 100 word paragraph about weather.'
{
"configuration": {
"chart": {
"type": "bar",
"polar": false,
"zoomType": "",
"options3d": {},
"height": 450,
"width": null,
"margin": null,
"inverted": false,
"zooming": {}
},
"credits": {
"enabled": false
},
"title": {
"text": "Llama2"
},
"colorAxis": null,
"subtitle": {
"text": ""
},
"xAxis": {
"title": {
"text": [
""
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": [
"flavor__gpu_model"
],
"lineWidth": 1,
"tickInterval": null,
"tickWidth": 0,
"tickLength": 10,
"tickPixelInterval": null,
"plotLines": null,
"labels": {
"enabled": false,
"format": null,
"formatter": "",
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false,
"step": 0
},
"plotBands": null,
"visible": true,
"floor": null,
"ceiling": null,
"type": "linear",
"min": null,
"gridLineWidth": null,
"gridLineColor": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"tickmarkPlacement": null,
"units": null,
"minRange": null
},
"yAxis": {
"title": {
"text": [
"Token/second<br>Higher is better"
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": null,
"plotLines": null,
"plotBands": null,
"lineWidth": null,
"tickInterval": null,
"tickLength": 10,
"floor": null,
"ceiling": null,
"gridLineInterpolation": null,
"gridLineWidth": 1,
"gridLineColor": "#CCC",
"min": null,
"max": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"minRange": null,
"type": "linear",
"tickmarkPlacement": null,
"labels": {
"enabled": true,
"formatter": null,
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false
}
},
"zAxis": {
"title": {
"text": "Token/second<br>Higher is better"
}
},
"plotOptions": {
"series": {
"dataLabels": {
"enabled": true,
"format": "{series.name}",
"distance": 30,
"align": "left",
"inside": true,
"allowOverlap": false,
"style": {
"fontSize": "17px"
}
},
"showInLegend": null,
"turboThreshold": 1000,
"stacking": "",
"groupPadding": 0,
"centerInCategory": false,
"findNearestPointBy": "x"
}
},
"navigator": {
"enabled": false
},
"scrollbar": {
"enabled": false
},
"rangeSelector": {
"enabled": false,
"inputEnabled": false
},
"legend": {
"enabled": true,
"maxHeight": null,
"align": "center",
"verticalAlign": "bottom",
"layout": "horizontal",
"width": null,
"margin": 12,
"reversed": false
},
"series": [
{
"name": "NVIDIA A100-SXM4-40GB",
"data": [
[
0,
108.43722222222222
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "NVIDIA A10G",
"data": [
[
1,
84.86333333333333
]
],
"grouping": false,
"color": "#f7981d"
},
{
"name": "NVIDIA A40",
"data": [
[
2,
94.9889880952381
]
],
"grouping": false,
"color": "#d20000"
},
{
"name": "NVIDIA H100 PCIe",
"data": [
[
3,
160.78670454545454
]
],
"grouping": false,
"color": "#510099"
},
{
"name": "NVIDIA L4",
"data": [
[
4,
50.90428571428571
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "Tesla P100",
"data": [
[
5,
21.73
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Tesla P100-PCIE-16GB",
"data": [
[
6,
22.0405
]
],
"grouping": false,
"color": "#510099"
},
{
"name": "Tesla T4",
"data": [
[
7,
47.31517241379311
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Tesla V100-PCIE-16GB",
"data": [
[
8,
110.15955555555556
]
],
"grouping": false,
"color": "#484848"
},
{
"name": "Tesla V100-SXM2-16GB",
"data": [
[
9,
117.11066666666667
]
],
"grouping": false,
"color": "#f7981d"
},
{
"name": "Tesla V100S-PCIE-32GB",
"data": [
[
10,
121.96701149425286
]
],
"grouping": false,
"color": "#484848"
}
],
"drilldown": {},
"tooltip": {
"enabled": true,
"useHTML": false,
"format": null,
"headerFormat": "",
"pointFormat": "<span style=\"color:{series.color}\">{series.name}</span>: <b>{point.y:.1f} token/sec</b>",
"footerFormat": "",
"shared": false,
"outside": false,
"valueDecimals": null,
"split": false
},
"annotations": null
},
"hc_type": "chart",
"id": "llama2"
}
{
"configuration": {
"chart": {
"type": "bar",
"polar": false,
"zoomType": "",
"options3d": {},
"height": 450,
"width": null,
"margin": null,
"inverted": false,
"zooming": {}
},
"credits": {
"enabled": false
},
"title": {
"text": "Llama2:13b"
},
"colorAxis": null,
"subtitle": {
"text": ""
},
"xAxis": {
"title": {
"text": [
""
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": [
"flavor__gpu_model"
],
"lineWidth": 1,
"tickInterval": null,
"tickWidth": 0,
"tickLength": 10,
"tickPixelInterval": null,
"plotLines": null,
"labels": {
"enabled": false,
"format": null,
"formatter": "",
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false,
"step": 0
},
"plotBands": null,
"visible": true,
"floor": null,
"ceiling": null,
"type": "linear",
"min": null,
"gridLineWidth": null,
"gridLineColor": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"tickmarkPlacement": null,
"units": null,
"minRange": null
},
"yAxis": {
"title": {
"text": [
"Token/second<br>Higher is better"
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": null,
"plotLines": null,
"plotBands": null,
"lineWidth": null,
"tickInterval": null,
"tickLength": 10,
"floor": null,
"ceiling": null,
"gridLineInterpolation": null,
"gridLineWidth": 1,
"gridLineColor": "#CCC",
"min": null,
"max": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"minRange": null,
"type": "linear",
"tickmarkPlacement": null,
"labels": {
"enabled": true,
"formatter": null,
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false
}
},
"zAxis": {
"title": {
"text": "Token/second<br>Higher is better"
}
},
"plotOptions": {
"series": {
"dataLabels": {
"enabled": true,
"format": "{series.name}",
"distance": 30,
"align": "left",
"inside": true,
"allowOverlap": false,
"style": {
"fontSize": "17px"
}
},
"showInLegend": null,
"turboThreshold": 1000,
"stacking": "",
"groupPadding": 0,
"centerInCategory": false,
"findNearestPointBy": "x"
}
},
"navigator": {
"enabled": false
},
"scrollbar": {
"enabled": false
},
"rangeSelector": {
"enabled": false,
"inputEnabled": false
},
"legend": {
"enabled": true,
"maxHeight": null,
"align": "center",
"verticalAlign": "bottom",
"layout": "horizontal",
"width": null,
"margin": 12,
"reversed": false
},
"series": [
{
"name": "NVIDIA A100-SXM4-40GB",
"data": [
[
0,
79.86888888888889
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "NVIDIA A10G",
"data": [
[
1,
52.69833333333333
]
],
"grouping": false,
"color": "#f7981d"
},
{
"name": "NVIDIA A40",
"data": [
[
2,
57.23049079754601
]
],
"grouping": false,
"color": "#d20000"
},
{
"name": "NVIDIA H100 PCIe",
"data": [
[
3,
108.09458823529413
]
],
"grouping": false,
"color": "#510099"
},
{
"name": "NVIDIA L4",
"data": [
[
4,
29.94962962962963
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "Tesla P100",
"data": [
[
5,
12.213333333333333
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Tesla P100-PCIE-16GB",
"data": [
[
6,
12.511621621621622
]
],
"grouping": false,
"color": "#510099"
},
{
"name": "Tesla T4",
"data": [
[
7,
26.076999999999998
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Tesla V100-PCIE-16GB",
"data": [
[
8,
67.29761904761905
]
],
"grouping": false,
"color": "#484848"
},
{
"name": "Tesla V100-SXM2-16GB",
"data": [
[
9,
70.5474074074074
]
],
"grouping": false,
"color": "#f7981d"
},
{
"name": "Tesla V100S-PCIE-32GB",
"data": [
[
10,
77.16588235294118
]
],
"grouping": false,
"color": "#484848"
}
],
"drilldown": {},
"tooltip": {
"enabled": true,
"useHTML": false,
"format": null,
"headerFormat": "",
"pointFormat": "<span style=\"color:{series.color}\">{series.name}</span>: <b>{point.y:.1f} token/sec</b>",
"footerFormat": "",
"shared": false,
"outside": false,
"valueDecimals": null,
"split": false
},
"annotations": null
},
"hc_type": "chart",
"id": "llama2:13b"
}
{
"configuration": {
"chart": {
"type": "bar",
"polar": false,
"zoomType": "",
"options3d": {},
"height": 450,
"width": null,
"margin": null,
"inverted": false,
"zooming": {}
},
"credits": {
"enabled": false
},
"title": {
"text": "Llama2:70b"
},
"colorAxis": null,
"subtitle": {
"text": ""
},
"xAxis": {
"title": {
"text": [
""
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": [
"flavor__gpu_model"
],
"lineWidth": 1,
"tickInterval": null,
"tickWidth": 0,
"tickLength": 10,
"tickPixelInterval": null,
"plotLines": null,
"labels": {
"enabled": false,
"format": null,
"formatter": "",
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false,
"step": 0
},
"plotBands": null,
"visible": true,
"floor": null,
"ceiling": null,
"type": "linear",
"min": null,
"gridLineWidth": null,
"gridLineColor": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"tickmarkPlacement": null,
"units": null,
"minRange": null
},
"yAxis": {
"title": {
"text": [
"Token/second<br>Higher is better"
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": null,
"plotLines": null,
"plotBands": null,
"lineWidth": null,
"tickInterval": null,
"tickLength": 10,
"floor": null,
"ceiling": null,
"gridLineInterpolation": null,
"gridLineWidth": 1,
"gridLineColor": "#CCC",
"min": null,
"max": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"minRange": null,
"type": "linear",
"tickmarkPlacement": null,
"labels": {
"enabled": true,
"formatter": null,
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false
}
},
"zAxis": {
"title": {
"text": "Token/second<br>Higher is better"
}
},
"plotOptions": {
"series": {
"dataLabels": {
"enabled": true,
"format": "{series.name}",
"distance": 30,
"align": "left",
"inside": true,
"allowOverlap": false,
"style": {
"fontSize": "17px"
}
},
"showInLegend": null,
"turboThreshold": 1000,
"stacking": "",
"groupPadding": 0,
"centerInCategory": false,
"findNearestPointBy": "x"
}
},
"navigator": {
"enabled": false
},
"scrollbar": {
"enabled": false
},
"rangeSelector": {
"enabled": false,
"inputEnabled": false
},
"legend": {
"enabled": true,
"maxHeight": null,
"align": "center",
"verticalAlign": "bottom",
"layout": "horizontal",
"width": null,
"margin": 12,
"reversed": false
},
"series": [
{
"name": "NVIDIA A100-SXM4-40GB",
"data": [
[
0,
1.7875
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "NVIDIA H100 PCIe",
"data": [
[
1,
30.475540540540543
]
],
"grouping": false,
"color": "#510099"
},
{
"name": "NVIDIA L4",
"data": [
[
2,
1.0558333333333334
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "Tesla P100",
"data": [
[
3,
0.4325
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Tesla P100-PCIE-16GB",
"data": [
[
4,
0.7361538461538462
]
],
"grouping": false,
"color": "#510099"
},
{
"name": "Tesla V100S-PCIE-32GB",
"data": [
[
5,
2.4988235294117644
]
],
"grouping": false,
"color": "#484848"
}
],
"drilldown": {},
"tooltip": {
"enabled": true,
"useHTML": false,
"format": null,
"headerFormat": "",
"pointFormat": "<span style=\"color:{series.color}\">{series.name}</span>: <b>{point.y:.1f} token/sec</b>",
"footerFormat": "",
"shared": false,
"outside": false,
"valueDecimals": null,
"split": false
},
"annotations": null
},
"hc_type": "chart",
"id": "llama2:70b"
}
Cloud Mercato's observations
- As the largest model requires 32GB of VRAM, the panel of eligible GPUs is very restricted in this category
- On the 70B model, the H100 leads by a gap of nearly 30 tokens/second over the next-best GPU (V100S)