OVHcloud GPU benchmark - Llama 3

Go back to list

Ollama is cross-platform software designed to run large language models (LLMs) locally on any computer. It supports a variety of AI models, including Llama 3, Mistral, Falcon, and more. To assess response speed, we used the ollama-benchmark tool, which measures throughput in tokens per second. The exact command-line invocation used for our testing is shown below:

# Benchmark inputs: the model tag to load and the ID of the question to ask.
model=llama3:8b  # or llama3:70b
question_id=81  # or 82, 83, 84 and 85
# Run the speed test with one worker, one turn, and a fixed seed and sampling options.
ollama-benchmark speed --model "$model" --question "$question_id" --max-workers 1 --max_turns 1 --mirostat 0 --mirostat_eta 0.1 --mirostat_tau 5.0 --num_ctx 2048 --repeat_last_n 64 --repeat_penalty 1.1 --temperature 0.8 --seed 0 --tfs_z 1.0 --num_predict 128 --top_k 40 --top_p 0.9

{ "configuration": { "chart": { "type": "bar", "polar": false, "zoomType": "", "options3d": {}, "height": 550, "width": null, "margin": null, "inverted": false, "zooming": {} }, "credits": { "enabled": false }, "title": { "text": "Llama3" }, "colorAxis": null, "subtitle": { "text": "" }, "xAxis": { "title": { "text": [ "" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": [ "provider__short_name", "flavor__name", "flavor__gpu_model" ], "lineWidth": 1, "tickInterval": null, "tickWidth": 0, "tickLength": 10, "tickPixelInterval": null, "plotLines": null, "labels": { "enabled": false, "format": null, "formatter": "", "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false, "step": 0 }, "plotBands": null, "visible": true, "floor": null, "ceiling": null, "type": "linear", "min": null, "gridLineWidth": null, "gridLineColor": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "tickmarkPlacement": null, "units": null, "minRange": null }, "yAxis": { "title": { "text": [ "Token/second Higher is better" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": null, "plotLines": null, "plotBands": null, "lineWidth": null, "tickInterval": null, "tickLength": 10, "floor": null, "ceiling": null, "gridLineInterpolation": null, "gridLineWidth": 1, "gridLineColor": "#CCC", "min": null, "max": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "minRange": null, "type": "linear", "tickmarkPlacement": null, "labels": { "enabled": true, "formatter": null, "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false } }, "zAxis": { "title": { "text": "Token/second Higher is better" } }, "plotOptions": { "series": { "dataLabels": { "enabled": true, "format": "{series.name}", "distance": 30, "align": "left", "inside": true, "allowOverlap": false, "style": { 
"fontSize": "17px" } }, "showInLegend": null, "turboThreshold": 1000, "stacking": "", "groupPadding": 0, "centerInCategory": false, "findNearestPointBy": "x" } }, "navigator": { "enabled": false }, "scrollbar": { "enabled": false }, "rangeSelector": { "enabled": false, "inputEnabled": false }, "legend": { "enabled": true, "maxHeight": null, "align": "center", "verticalAlign": "bottom", "layout": "horizontal", "width": null, "margin": 12, "reversed": false }, "series": [ { "name": "AWS g5.xlarge NVIDIA A10G", "data": [ [ 0, 74.11145583602332 ] ], "grouping": false, "color": "#f7981d" }, { "name": "AWS p3.2xlarge Tesla V100-SXM2-16GB", "data": [ [ 1, 84.25722555865242 ] ], "grouping": false, "color": "#f7981d" }, { "name": "Azure Standard_NC40ads_H100_v5 NVIDIA H100 NVL", "data": [ [ 2, 191.02202188374866 ] ], "grouping": false, "color": "#55b3ff" }, { "name": "Azure Standard_NC6s_v3 Tesla V100-PCIE-16GB", "data": [ [ 3, 85.9395876676278 ] ], "grouping": false, "color": "#55b3ff" }, { "name": "Azure Standard_NV36ads_A10_v5 NVIDIA A10", "data": [ [ 4, 79.82683635781541 ] ], "grouping": false, "color": "#55b3ff" }, { "name": "Google a2-highgpu-1g NVIDIA A100-SXM4-40GB", "data": [ [ 5, 94.25855110935245 ] ], "grouping": false, "color": "#55b400" }, { "name": "Google a2-ultragpu-1g NVIDIA A100-SXM4-80GB", "data": [ [ 6, 98.54227258468484 ] ], "grouping": false, "color": "#55b400" }, { "name": "Google g2-standard-16 NVIDIA L4", "data": [ [ 7, 44.20125112599654 ] ], "grouping": false, "color": "#55b400" }, { "name": "Google n1-highmem-8 Intel Skylake Tesla V100 Tesla V100-SXM2-16GB", "data": [ [ 8, 88.48112862998406 ] ], "grouping": false, "color": "#55b400" }, { "name": "OVH H100-380 NVIDIA H100 PCIe", "data": [ [ 9, 120.23815120370868 ] ], "grouping": false, "color": "#484848" }, { "name": "OVH L4-90 NVIDIA L4", "data": [ [ 10, 48.92874165631959 ] ], "grouping": false, "color": "#484848" }, { "name": "OVH L40S-90 NVIDIA L40S", "data": [ [ 11, 115.28244713897865 ] ], 
"grouping": false, "color": "#484848" }, { "name": "OVH T1-LE-45 Tesla V100-PCIE-16GB", "data": [ [ 12, 84.11657733154148 ] ], "grouping": false, "color": "#484848" } ], "drilldown": {}, "tooltip": { "enabled": true, "useHTML": false, "format": null, "headerFormat": "", "pointFormat": "{series.name}: {point.y:.1f} token/sec", "footerFormat": "", "shared": false, "outside": false, "valueDecimals": null, "split": false }, "annotations": null }, "hc_type": "chart", "id": "llama3" }

{ "configuration": { "chart": { "type": "bar", "polar": false, "zoomType": "", "options3d": {}, "height": 550, "width": null, "margin": null, "inverted": false, "zooming": {} }, "credits": { "enabled": false }, "title": { "text": "Llama3:70b" }, "colorAxis": null, "subtitle": { "text": "" }, "xAxis": { "title": { "text": [ "" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": [ "provider__short_name", "flavor__name", "flavor__gpu_model" ], "lineWidth": 1, "tickInterval": null, "tickWidth": 0, "tickLength": 10, "tickPixelInterval": null, "plotLines": null, "labels": { "enabled": false, "format": null, "formatter": "", "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false, "step": 0 }, "plotBands": null, "visible": true, "floor": null, "ceiling": null, "type": "linear", "min": null, "gridLineWidth": null, "gridLineColor": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "tickmarkPlacement": null, "units": null, "minRange": null }, "yAxis": { "title": { "text": [ "Token/second Higher is better" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": null, "plotLines": null, "plotBands": null, "lineWidth": null, "tickInterval": null, "tickLength": 10, "floor": null, "ceiling": null, "gridLineInterpolation": null, "gridLineWidth": 1, "gridLineColor": "#CCC", "min": null, "max": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "minRange": null, "type": "linear", "tickmarkPlacement": null, "labels": { "enabled": true, "formatter": null, "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false } }, "zAxis": { "title": { "text": "Token/second Higher is better" } }, "plotOptions": { "series": { "dataLabels": { "enabled": true, "format": "{series.name}", "distance": 30, "align": "left", "inside": true, "allowOverlap": false, "style": { 
"fontSize": "17px" } }, "showInLegend": null, "turboThreshold": 1000, "stacking": "", "groupPadding": 0, "centerInCategory": false, "findNearestPointBy": "x" } }, "navigator": { "enabled": false }, "scrollbar": { "enabled": false }, "rangeSelector": { "enabled": false, "inputEnabled": false }, "legend": { "enabled": true, "maxHeight": null, "align": "center", "verticalAlign": "bottom", "layout": "horizontal", "width": null, "margin": 12, "reversed": false }, "series": [ { "name": "Azure Standard_NC40ads_H100_v5 NVIDIA H100 NVL", "data": [ [ 0, 40.71168018178036 ] ], "grouping": false, "color": "#55b3ff" }, { "name": "Google a2-highgpu-1g NVIDIA A100-SXM4-40GB", "data": [ [ 1, 14.015375297515288 ] ], "grouping": false, "color": "#55b400" }, { "name": "Google a2-ultragpu-1g NVIDIA A100-SXM4-80GB", "data": [ [ 2, 25.291842674146736 ] ], "grouping": false, "color": "#55b400" }, { "name": "OVH H100-380 NVIDIA H100 PCIe", "data": [ [ 3, 29.7484356028196 ] ], "grouping": false, "color": "#484848" }, { "name": "OVH L40S-90 NVIDIA L40S", "data": [ [ 4, 16.64672521178475 ] ], "grouping": false, "color": "#484848" } ], "drilldown": {}, "tooltip": { "enabled": true, "useHTML": false, "format": null, "headerFormat": "", "pointFormat": "{series.name}: {point.y:.1f} token/sec", "footerFormat": "", "shared": false, "outside": false, "valueDecimals": null, "split": false }, "annotations": null }, "hc_type": "chart", "id": "llama3:70b" }

{ "configuration": { "chart": { "type": "bar", "polar": false, "zoomType": "", "options3d": {}, "height": 550, "width": null, "margin": null, "inverted": false, "zooming": {} }, "credits": { "enabled": false }, "title": { "text": "Llama3.1:8b-instruct-q8_0" }, "colorAxis": null, "subtitle": { "text": "" }, "xAxis": { "title": { "text": [ "" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": [ "provider__short_name", "flavor__name", "flavor__gpu_model" ], "lineWidth": 1, "tickInterval": null, "tickWidth": 0, "tickLength": 10, "tickPixelInterval": null, "plotLines": null, "labels": { "enabled": false, "format": null, "formatter": "", "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false, "step": 0 }, "plotBands": null, "visible": true, "floor": null, "ceiling": null, "type": "linear", "min": null, "gridLineWidth": null, "gridLineColor": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "tickmarkPlacement": null, "units": null, "minRange": null }, "yAxis": { "title": { "text": [ "Token/second Higher is better" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": null, "plotLines": null, "plotBands": null, "lineWidth": null, "tickInterval": null, "tickLength": 10, "floor": null, "ceiling": null, "gridLineInterpolation": null, "gridLineWidth": 1, "gridLineColor": "#CCC", "min": null, "max": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "minRange": null, "type": "linear", "tickmarkPlacement": null, "labels": { "enabled": true, "formatter": null, "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false } }, "zAxis": { "title": { "text": "Token/second Higher is better" } }, "plotOptions": { "series": { "dataLabels": { "enabled": true, "format": "{series.name}", "distance": 30, "align": "left", "inside": true, "allowOverlap": 
false, "style": { "fontSize": "17px" } }, "showInLegend": null, "turboThreshold": 1000, "stacking": "", "groupPadding": 0, "centerInCategory": false, "findNearestPointBy": "x" } }, "navigator": { "enabled": false }, "scrollbar": { "enabled": false }, "rangeSelector": { "enabled": false, "inputEnabled": false }, "legend": { "enabled": true, "maxHeight": null, "align": "center", "verticalAlign": "bottom", "layout": "horizontal", "width": null, "margin": 12, "reversed": false }, "series": [ { "name": "AWS g5.xlarge NVIDIA A10G", "data": [ [ 0, 49.45721996577291 ] ], "grouping": false, "color": "#f7981d" }, { "name": "AWS p3.2xlarge Tesla V100-SXM2-16GB", "data": [ [ 1, 61.257833654542665 ] ], "grouping": false, "color": "#f7981d" }, { "name": "Azure Standard_NC40ads_H100_v5 NVIDIA H100 NVL", "data": [ [ 2, 160.02328797391598 ] ], "grouping": false, "color": "#55b3ff" }, { "name": "Azure Standard_NC6s_v3 Tesla V100-PCIE-16GB", "data": [ [ 3, 63.16613714377257 ] ], "grouping": false, "color": "#55b3ff" }, { "name": "Google a2-highgpu-1g NVIDIA A100-SXM4-40GB", "data": [ [ 4, 82.24751543850662 ] ], "grouping": false, "color": "#55b400" }, { "name": "Google a2-ultragpu-1g NVIDIA A100-SXM4-80GB", "data": [ [ 5, 91.5994552681905 ] ], "grouping": false, "color": "#55b400" }, { "name": "Google g2-standard-16 NVIDIA L4", "data": [ [ 6, 27.006729680994383 ] ], "grouping": false, "color": "#55b400" }, { "name": "Google n1-highmem-8 Intel Skylake Tesla V100 Tesla V100-SXM2-16GB", "data": [ [ 7, 64.7023253930343 ] ], "grouping": false, "color": "#55b400" }, { "name": "OVH A10-45 NVIDIA A10", "data": [ [ 8, 52.23567869632677 ] ], "grouping": false, "color": "#484848" }, { "name": "OVH H100-380 NVIDIA H100 PCIe", "data": [ [ 9, 124.8827190203083 ] ], "grouping": false, "color": "#484848" }, { "name": "OVH L4-90 NVIDIA L4", "data": [ [ 10, 28.934930816750924 ] ], "grouping": false, "color": "#484848" }, { "name": "OVH L40S-90 NVIDIA L40S", "data": [ [ 11, 72.03119918751457 ] ], 
"grouping": false, "color": "#484848" }, { "name": "OVH T1-45 Tesla V100-PCIE-16GB", "data": [ [ 12, 60.47893212079448 ] ], "grouping": false, "color": "#484848" } ], "drilldown": {}, "tooltip": { "enabled": true, "useHTML": false, "format": null, "headerFormat": "", "pointFormat": "{series.name}: {point.y:.1f} token/sec", "footerFormat": "", "shared": false, "outside": false, "valueDecimals": null, "split": false }, "annotations": null }, "hc_type": "chart", "id": "llama3.1:8b-instruct-q8_0" }

{ "configuration": { "chart": { "type": "bar", "polar": false, "zoomType": "", "options3d": {}, "height": 550, "width": null, "margin": null, "inverted": false, "zooming": {} }, "credits": { "enabled": false }, "title": { "text": "Llama3.1:70b-instruct-q8_0" }, "colorAxis": null, "subtitle": { "text": "" }, "xAxis": { "title": { "text": [ "" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": [ "provider__short_name", "flavor__name", "flavor__gpu_model" ], "lineWidth": 1, "tickInterval": null, "tickWidth": 0, "tickLength": 10, "tickPixelInterval": null, "plotLines": null, "labels": { "enabled": false, "format": null, "formatter": "", "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false, "step": 0 }, "plotBands": null, "visible": true, "floor": null, "ceiling": null, "type": "linear", "min": null, "gridLineWidth": null, "gridLineColor": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "tickmarkPlacement": null, "units": null, "minRange": null }, "yAxis": { "title": { "text": [ "Token/second Higher is better" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": null, "plotLines": null, "plotBands": null, "lineWidth": null, "tickInterval": null, "tickLength": 10, "floor": null, "ceiling": null, "gridLineInterpolation": null, "gridLineWidth": 1, "gridLineColor": "#CCC", "min": null, "max": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "minRange": null, "type": "linear", "tickmarkPlacement": null, "labels": { "enabled": true, "formatter": null, "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false } }, "zAxis": { "title": { "text": "Token/second Higher is better" } }, "plotOptions": { "series": { "dataLabels": { "enabled": true, "format": "{series.name}", "distance": 30, "align": "left", "inside": true, "allowOverlap": 
false, "style": { "fontSize": "17px" } }, "showInLegend": null, "turboThreshold": 1000, "stacking": "", "groupPadding": 0, "centerInCategory": false, "findNearestPointBy": "x" } }, "navigator": { "enabled": false }, "scrollbar": { "enabled": false }, "rangeSelector": { "enabled": false, "inputEnabled": false }, "legend": { "enabled": true, "maxHeight": null, "align": "center", "verticalAlign": "bottom", "layout": "horizontal", "width": null, "margin": 12, "reversed": false }, "series": [ { "name": "Azure Standard_NC40ads_H100_v5 NVIDIA H100 NVL", "data": [ [ 0, 27.38188940629102 ] ], "grouping": false, "color": "#55b3ff" }, { "name": "Google a2-ultragpu-1g NVIDIA A100-SXM4-80GB", "data": [ [ 1, 17.472856372685843 ] ], "grouping": false, "color": "#55b400" }, { "name": "OVH H100-380 NVIDIA H100 PCIe", "data": [ [ 2, 21.132689666982362 ] ], "grouping": false, "color": "#484848" } ], "drilldown": {}, "tooltip": { "enabled": true, "useHTML": false, "format": null, "headerFormat": "", "pointFormat": "{series.name}: {point.y:.1f} token/sec", "footerFormat": "", "shared": false, "outside": false, "valueDecimals": null, "split": false }, "annotations": null }, "hc_type": "chart", "id": "llama3.1:70b-instruct-q8_0" }

Cloud Mercato's observations:

Without at least 40GB of VRAM, it's impossible for most VMs to run llama3 70b.
80GB GPUs, and especially the H100, are the best options for running large Llama 3 models.
No matter the size of the model, H100s are always the best performers.