Scaleway NVIDIA H100 Performance evaluation - Ollama


Ollama is an open-source tool for running Large Language Models (LLMs) of various types and sizes. In this study we used the official Llama2 model in its three sizes:

  • 7 billion parameters (7B): 8GB of VRAM
  • 13B: 16GB of VRAM
  • 70B: 32GB of VRAM

A higher parameter count theoretically improves the quality of the model's answers, at the cost of requiring more memory from the GPUs.
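
Each size corresponds to a distinct tag in the Ollama library (the same identifiers used in the charts below). As a reminder, assuming a standard Ollama installation, the three variants can be fetched before benchmarking with:

$ ollama pull llama2        # 7B, the default tag
$ ollama pull llama2:13b    # 13B
$ ollama pull llama2:70b    # 70B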

As our evaluation is not about the model's quality or its capacity to produce good answers, we focus on how quickly the system generates a response, measured in tokens per second, a token generally corresponding to a word or part of a word.

Here is an example of a test invocation:

$ ollama run llama2 --verbose 'Write a 100 word paragraph about weather.'
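
With --verbose, Ollama prints timing statistics after the answer, including an "eval rate" expressed in tokens per second, which is the kind of throughput compared below. A minimal, non-interactive way to capture it (assuming the statistics may be written to stderr, hence the redirection):

$ ollama run llama2:13b --verbose 'Write a 100 word paragraph about weather.' 2>&1 | grep 'eval rate'

Note that this pattern also matches the "prompt eval rate" line (prompt processing); the plain "eval rate" line is the generation throughput.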
{ "configuration": { "chart": { "type": "bar", "polar": false, "zoomType": "", "options3d": {}, "height": 450, "width": null, "margin": null, "inverted": false, "zooming": {} }, "credits": { "enabled": false }, "title": { "text": "Llama2" }, "colorAxis": null, "subtitle": { "text": "" }, "xAxis": { "title": { "text": [ "" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": [ "flavor__gpu_model" ], "lineWidth": 1, "tickInterval": null, "tickWidth": 0, "tickLength": 10, "tickPixelInterval": null, "plotLines": null, "labels": { "enabled": false, "format": null, "formatter": "", "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false, "step": 0 }, "plotBands": null, "visible": true, "floor": null, "ceiling": null, "type": "linear", "min": null, "gridLineWidth": null, "gridLineColor": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "tickmarkPlacement": null, "units": null, "minRange": null }, "yAxis": { "title": { "text": [ "Token/second<br>Higher is better" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": null, "plotLines": null, "plotBands": null, "lineWidth": null, "tickInterval": null, "tickLength": 10, "floor": null, "ceiling": null, "gridLineInterpolation": null, "gridLineWidth": 1, "gridLineColor": "#CCC", "min": null, "max": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "minRange": null, "type": "linear", "tickmarkPlacement": null, "labels": { "enabled": true, "formatter": null, "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false } }, "zAxis": { "title": { "text": "Token/second<br>Higher is better" } }, "plotOptions": { "series": { "dataLabels": { "enabled": true, "format": "{series.name}", "distance": 30, "align": "left", "inside": true, "allowOverlap": false, "style": { "fontSize": "17px" } }, "showInLegend": null, "turboThreshold": 1000, "stacking": "", "groupPadding": 0, "centerInCategory": false, "findNearestPointBy": "x" } }, "navigator": { "enabled": false }, "scrollbar": { "enabled": false }, "rangeSelector": { "enabled": false, "inputEnabled": false }, "legend": { "enabled": true, "maxHeight": null, "align": "center", "verticalAlign": "bottom", "layout": "horizontal", "width": null, "margin": 12, "reversed": false }, "series": [ { "name": "A100-SXM4-40GB", "data": [ [ 0, 108.43722222222222 ] ], "grouping": false, "color": "#55b400" }, { "name": "A10G", "data": [ [ 1, 84.86333333333333 ] ], "grouping": false, "color": "#f7981d" }, { "name": "L4", "data": [ [ 2, 50.90428571428571 ] ], "grouping": false, "color": "#55b400" }, { "name": "NVIDIA A40", "data": [ [ 3, 94.9889880952381 ] ], "grouping": false, "color": "#d20000" }, { "name": "NVIDIA H100 PCIe", "data": [ [ 4, 160.78670454545454 ] ], "grouping": false, "color": "#510099" }, { "name": "Telsa T4", "data": [ [ 5, 47.31517241379311 ] ], "grouping": false, "color": "#55b3ff" }, { "name": "Tesla P100", "data": [ [ 6, 21.73 ] ], "grouping": false, "color": "#55b3ff" }, { "name": "Tesla P100-PCIE-16GB", "data": [ [ 7, 22.0405 ] ], "grouping": false, "color": "#510099" }, { "name": "Tesla V100-PCIE-16GB", "data": [ [ 8, 110.15955555555556 ] ], "grouping": false, "color": "#484848" }, { "name": "Tesla V100-SXM2-16GB", "data": [ [ 9, 117.11066666666667 ] ], "grouping": false, "color": "#f7981d" }, { "name": "Tesla V100S-PCIE-32GB", "data": [ [ 10, 121.96701149425286 ] ], 
"grouping": false, "color": "#484848" } ], "drilldown": {}, "tooltip": { "enabled": true, "useHTML": false, "headerFormat": "", "pointFormat": "<span style=\"color:{series.color}\">{series.name}</span>: <b>{point.y:.1f} token/sec</b>", "footerFormat": "", "shared": false, "outside": false, "valueDecimals": null, "split": false }, "annotations": null }, "hc_type": "chart", "id": "llama2" }
{ "configuration": { "chart": { "type": "bar", "polar": false, "zoomType": "", "options3d": {}, "height": 450, "width": null, "margin": null, "inverted": false, "zooming": {} }, "credits": { "enabled": false }, "title": { "text": "Llama2:13b" }, "colorAxis": null, "subtitle": { "text": "" }, "xAxis": { "title": { "text": [ "" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": [ "flavor__gpu_model" ], "lineWidth": 1, "tickInterval": null, "tickWidth": 0, "tickLength": 10, "tickPixelInterval": null, "plotLines": null, "labels": { "enabled": false, "format": null, "formatter": "", "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false, "step": 0 }, "plotBands": null, "visible": true, "floor": null, "ceiling": null, "type": "linear", "min": null, "gridLineWidth": null, "gridLineColor": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "tickmarkPlacement": null, "units": null, "minRange": null }, "yAxis": { "title": { "text": [ "Token/second<br>Higher is better" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": null, "plotLines": null, "plotBands": null, "lineWidth": null, "tickInterval": null, "tickLength": 10, "floor": null, "ceiling": null, "gridLineInterpolation": null, "gridLineWidth": 1, "gridLineColor": "#CCC", "min": null, "max": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "minRange": null, "type": "linear", "tickmarkPlacement": null, "labels": { "enabled": true, "formatter": null, "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false } }, "zAxis": { "title": { "text": "Token/second<br>Higher is better" } }, "plotOptions": { "series": { "dataLabels": { "enabled": true, "format": "{series.name}", "distance": 30, "align": "left", "inside": true, "allowOverlap": false, "style": { "fontSize": "17px" } }, "showInLegend": null, "turboThreshold": 1000, "stacking": "", "groupPadding": 0, "centerInCategory": false, "findNearestPointBy": "x" } }, "navigator": { "enabled": false }, "scrollbar": { "enabled": false }, "rangeSelector": { "enabled": false, "inputEnabled": false }, "legend": { "enabled": true, "maxHeight": null, "align": "center", "verticalAlign": "bottom", "layout": "horizontal", "width": null, "margin": 12, "reversed": false }, "series": [ { "name": "A100-SXM4-40GB", "data": [ [ 0, 79.86888888888889 ] ], "grouping": false, "color": "#55b400" }, { "name": "A10G", "data": [ [ 1, 52.69833333333333 ] ], "grouping": false, "color": "#f7981d" }, { "name": "L4", "data": [ [ 2, 29.94962962962963 ] ], "grouping": false, "color": "#55b400" }, { "name": "NVIDIA A40", "data": [ [ 3, 57.23049079754601 ] ], "grouping": false, "color": "#d20000" }, { "name": "NVIDIA H100 PCIe", "data": [ [ 4, 108.09458823529413 ] ], "grouping": false, "color": "#510099" }, { "name": "Telsa T4", "data": [ [ 5, 26.076999999999998 ] ], "grouping": false, "color": "#55b3ff" }, { "name": "Tesla P100", "data": [ [ 6, 12.213333333333333 ] ], "grouping": false, "color": "#55b3ff" }, { "name": "Tesla P100-PCIE-16GB", "data": [ [ 7, 12.511621621621622 ] ], "grouping": false, "color": "#510099" }, { "name": "Tesla V100-PCIE-16GB", "data": [ [ 8, 67.29761904761905 ] ], "grouping": false, "color": "#484848" }, { "name": "Tesla V100-SXM2-16GB", "data": [ [ 9, 70.5474074074074 ] ], "grouping": false, "color": "#f7981d" }, { "name": "Tesla V100S-PCIE-32GB", "data": [ [ 10, 
77.16588235294118 ] ], "grouping": false, "color": "#484848" } ], "drilldown": {}, "tooltip": { "enabled": true, "useHTML": false, "headerFormat": "", "pointFormat": "<span style=\"color:{series.color}\">{series.name}</span>: <b>{point.y:.1f} token/sec</b>", "footerFormat": "", "shared": false, "outside": false, "valueDecimals": null, "split": false }, "annotations": null }, "hc_type": "chart", "id": "llama2:13b" }
{ "configuration": { "chart": { "type": "bar", "polar": false, "zoomType": "", "options3d": {}, "height": 450, "width": null, "margin": null, "inverted": false, "zooming": {} }, "credits": { "enabled": false }, "title": { "text": "Llama2:70b" }, "colorAxis": null, "subtitle": { "text": "" }, "xAxis": { "title": { "text": [ "" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": [ "flavor__gpu_model" ], "lineWidth": 1, "tickInterval": null, "tickWidth": 0, "tickLength": 10, "tickPixelInterval": null, "plotLines": null, "labels": { "enabled": false, "format": null, "formatter": "", "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false, "step": 0 }, "plotBands": null, "visible": true, "floor": null, "ceiling": null, "type": "linear", "min": null, "gridLineWidth": null, "gridLineColor": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "tickmarkPlacement": null, "units": null, "minRange": null }, "yAxis": { "title": { "text": [ "Token/second<br>Higher is better" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": null, "plotLines": null, "plotBands": null, "lineWidth": null, "tickInterval": null, "tickLength": 10, "floor": null, "ceiling": null, "gridLineInterpolation": null, "gridLineWidth": 1, "gridLineColor": "#CCC", "min": null, "max": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "minRange": null, "type": "linear", "tickmarkPlacement": null, "labels": { "enabled": true, "formatter": null, "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false } }, "zAxis": { "title": { "text": "Token/second<br>Higher is better" } }, "plotOptions": { "series": { "dataLabels": { "enabled": true, "format": "{series.name}", "distance": 30, "align": "left", "inside": true, "allowOverlap": false, "style": { "fontSize": "17px" } }, "showInLegend": null, "turboThreshold": 1000, "stacking": "", "groupPadding": 0, "centerInCategory": false, "findNearestPointBy": "x" } }, "navigator": { "enabled": false }, "scrollbar": { "enabled": false }, "rangeSelector": { "enabled": false, "inputEnabled": false }, "legend": { "enabled": true, "maxHeight": null, "align": "center", "verticalAlign": "bottom", "layout": "horizontal", "width": null, "margin": 12, "reversed": false }, "series": [ { "name": "A100-SXM4-40GB", "data": [ [ 0, 1.7875 ] ], "grouping": false, "color": "#55b400" }, { "name": "L4", "data": [ [ 1, 1.0558333333333334 ] ], "grouping": false, "color": "#55b400" }, { "name": "NVIDIA H100 PCIe", "data": [ [ 2, 30.475540540540543 ] ], "grouping": false, "color": "#510099" }, { "name": "Tesla P100", "data": [ [ 3, 0.4325 ] ], "grouping": false, "color": "#55b3ff" }, { "name": "Tesla P100-PCIE-16GB", "data": [ [ 4, 0.7361538461538462 ] ], "grouping": false, "color": "#510099" }, { "name": "Tesla V100S-PCIE-32GB", "data": [ [ 5, 2.4988235294117644 ] ], "grouping": false, "color": "#484848" } ], "drilldown": {}, "tooltip": { "enabled": true, "useHTML": false, "headerFormat": "", "pointFormat": "<span style=\"color:{series.color}\">{series.name}</span>: <b>{point.y:.1f} token/sec</b>", "footerFormat": "", "shared": false, "outside": false, "valueDecimals": null, "split": false }, "annotations": null }, "hc_type": "chart", "id": "llama2:70b" }

Cloud Mercato's observations

  • As the largest model requires 32GB of VRAM, the panel of GPUs able to hold it is very restricted in this category (a quick way to check the fit is sketched below)
  • On this 70B model, the H100 opens a performance gap of roughly 30 tokens per second over the rest of the panel, which stays below 2.5 tokens per second
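
To tell whether a model genuinely fits in GPU memory or is partially offloaded to system RAM (which would explain throughputs of only a few tokens per second), one quick check while the model is loaded is to query the card's memory usage, for example:

$ nvidia-smi --query-gpu=name,memory.total,memory.used --format=csv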