OVHcloud GPU benchmark - Llama 3
Go back to list
Ollama is a cross-platform software tool designed to run large language models (LLMs) locally on any computer. It supports a variety of AI models, including LLaMA-3, Mistral, Falcon, and more. To assess response speed, we used the ollama-benchmark tool, measuring throughput in tokens per second. The specific command-line invocation used for our testing is detailed below:
# Model under test; use llama3:70b for the larger variant.
model=llama3:8b
# Benchmark question passed to --question; repeat the run with 82, 83, 84 and 85.
question_id=81
# Single-worker, single-turn speed run with a fixed seed and explicit sampling parameters.
ollama-benchmark speed --model "$model" --question "$question_id" --max-workers 1 --max_turns 1 --mirostat 0 --mirostat_eta 0.1 --mirostat_tau 5.0 --num_ctx 2048 --repeat_last_n 64 --repeat_penalty 1.1 --temperature 0.8 --seed 0 --tfs_z 1.0 --num_predict 128 --top_k 40 --top_p 0.9
{
"configuration": {
"chart": {
"type": "bar",
"polar": false,
"zoomType": "",
"options3d": {},
"height": 550,
"width": null,
"margin": null,
"inverted": false,
"zooming": {}
},
"credits": {
"enabled": false
},
"title": {
"text": "Llama3"
},
"colorAxis": null,
"subtitle": {
"text": ""
},
"xAxis": {
"title": {
"text": [
""
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": [
"provider__short_name",
"flavor__name",
"flavor__gpu_model"
],
"lineWidth": 1,
"tickInterval": null,
"tickWidth": 0,
"tickLength": 10,
"tickPixelInterval": null,
"plotLines": null,
"labels": {
"enabled": false,
"format": null,
"formatter": "",
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false,
"step": 0
},
"plotBands": null,
"visible": true,
"floor": null,
"ceiling": null,
"type": "linear",
"min": null,
"gridLineWidth": null,
"gridLineColor": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"tickmarkPlacement": null,
"units": null,
"minRange": null
},
"yAxis": {
"title": {
"text": [
"Token/second<br>Higher is better"
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": null,
"plotLines": null,
"plotBands": null,
"lineWidth": null,
"tickInterval": null,
"tickLength": 10,
"floor": null,
"ceiling": null,
"gridLineInterpolation": null,
"gridLineWidth": 1,
"gridLineColor": "#CCC",
"min": null,
"max": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"minRange": null,
"type": "linear",
"tickmarkPlacement": null,
"labels": {
"enabled": true,
"formatter": null,
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false
}
},
"zAxis": {
"title": {
"text": "Token/second<br>Higher is better"
}
},
"plotOptions": {
"series": {
"dataLabels": {
"enabled": true,
"format": "{series.name}",
"distance": 30,
"align": "left",
"inside": true,
"allowOverlap": false,
"style": {
"fontSize": "17px"
}
},
"showInLegend": null,
"turboThreshold": 1000,
"stacking": "",
"groupPadding": 0,
"centerInCategory": false,
"findNearestPointBy": "x"
}
},
"navigator": {
"enabled": false
},
"scrollbar": {
"enabled": false
},
"rangeSelector": {
"enabled": false,
"inputEnabled": false
},
"legend": {
"enabled": true,
"maxHeight": null,
"align": "center",
"verticalAlign": "bottom",
"layout": "horizontal",
"width": null,
"margin": 12,
"reversed": false
},
"series": [
{
"name": "AWS g5.xlarge NVIDIA A10G",
"data": [
[
0,
74.11145583602332
]
],
"grouping": false,
"color": "#f7981d"
},
{
"name": "AWS p3.2xlarge Tesla V100-SXM2-16GB",
"data": [
[
1,
84.25722555865242
]
],
"grouping": false,
"color": "#f7981d"
},
{
"name": "Azure Standard_NC40ads_H100_v5 NVIDIA H100 NVL",
"data": [
[
2,
191.02202188374866
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Azure Standard_NC6s_v3 Tesla V100-PCIE-16GB",
"data": [
[
3,
85.9395876676278
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Azure Standard_NV36ads_A10_v5 NVIDIA A10",
"data": [
[
4,
79.82683635781541
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Google a2-highgpu-1g NVIDIA A100-SXM4-40GB",
"data": [
[
5,
94.25855110935245
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "Google a2-ultragpu-1g NVIDIA A100-SXM4-80GB",
"data": [
[
6,
98.54227258468484
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "Google g2-standard-16 NVIDIA L4",
"data": [
[
7,
44.20125112599654
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "Google n1-highmem-8 Intel Skylake Tesla V100 Tesla V100-SXM2-16GB",
"data": [
[
8,
88.48112862998406
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "OVH H100-380 NVIDIA H100 PCIe",
"data": [
[
9,
120.23815120370868
]
],
"grouping": false,
"color": "#484848"
},
{
"name": "OVH L4-90 NVIDIA L4",
"data": [
[
10,
48.92874165631959
]
],
"grouping": false,
"color": "#484848"
},
{
"name": "OVH L40S-90 NVIDIA L40S",
"data": [
[
11,
115.28244713897865
]
],
"grouping": false,
"color": "#484848"
},
{
"name": "OVH T1-LE-45 Tesla V100-PCIE-16GB",
"data": [
[
12,
84.11657733154148
]
],
"grouping": false,
"color": "#484848"
}
],
"drilldown": {},
"tooltip": {
"enabled": true,
"useHTML": false,
"format": null,
"headerFormat": "",
"pointFormat": "<span style=\"color:{series.color}\">{series.name}</span>: <b>{point.y:.1f} token/sec</b>",
"footerFormat": "",
"shared": false,
"outside": false,
"valueDecimals": null,
"split": false
},
"annotations": null
},
"hc_type": "chart",
"id": "llama3"
}
{
"configuration": {
"chart": {
"type": "bar",
"polar": false,
"zoomType": "",
"options3d": {},
"height": 550,
"width": null,
"margin": null,
"inverted": false,
"zooming": {}
},
"credits": {
"enabled": false
},
"title": {
"text": "Llama3:70b"
},
"colorAxis": null,
"subtitle": {
"text": ""
},
"xAxis": {
"title": {
"text": [
""
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": [
"provider__short_name",
"flavor__name",
"flavor__gpu_model"
],
"lineWidth": 1,
"tickInterval": null,
"tickWidth": 0,
"tickLength": 10,
"tickPixelInterval": null,
"plotLines": null,
"labels": {
"enabled": false,
"format": null,
"formatter": "",
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false,
"step": 0
},
"plotBands": null,
"visible": true,
"floor": null,
"ceiling": null,
"type": "linear",
"min": null,
"gridLineWidth": null,
"gridLineColor": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"tickmarkPlacement": null,
"units": null,
"minRange": null
},
"yAxis": {
"title": {
"text": [
"Token/second<br>Higher is better"
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": null,
"plotLines": null,
"plotBands": null,
"lineWidth": null,
"tickInterval": null,
"tickLength": 10,
"floor": null,
"ceiling": null,
"gridLineInterpolation": null,
"gridLineWidth": 1,
"gridLineColor": "#CCC",
"min": null,
"max": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"minRange": null,
"type": "linear",
"tickmarkPlacement": null,
"labels": {
"enabled": true,
"formatter": null,
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false
}
},
"zAxis": {
"title": {
"text": "Token/second<br>Higher is better"
}
},
"plotOptions": {
"series": {
"dataLabels": {
"enabled": true,
"format": "{series.name}",
"distance": 30,
"align": "left",
"inside": true,
"allowOverlap": false,
"style": {
"fontSize": "17px"
}
},
"showInLegend": null,
"turboThreshold": 1000,
"stacking": "",
"groupPadding": 0,
"centerInCategory": false,
"findNearestPointBy": "x"
}
},
"navigator": {
"enabled": false
},
"scrollbar": {
"enabled": false
},
"rangeSelector": {
"enabled": false,
"inputEnabled": false
},
"legend": {
"enabled": true,
"maxHeight": null,
"align": "center",
"verticalAlign": "bottom",
"layout": "horizontal",
"width": null,
"margin": 12,
"reversed": false
},
"series": [
{
"name": "Azure Standard_NC40ads_H100_v5 NVIDIA H100 NVL",
"data": [
[
0,
40.71168018178036
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Google a2-highgpu-1g NVIDIA A100-SXM4-40GB",
"data": [
[
1,
14.015375297515288
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "Google a2-ultragpu-1g NVIDIA A100-SXM4-80GB",
"data": [
[
2,
25.291842674146736
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "OVH H100-380 NVIDIA H100 PCIe",
"data": [
[
3,
29.7484356028196
]
],
"grouping": false,
"color": "#484848"
},
{
"name": "OVH L40S-90 NVIDIA L40S",
"data": [
[
4,
16.64672521178475
]
],
"grouping": false,
"color": "#484848"
}
],
"drilldown": {},
"tooltip": {
"enabled": true,
"useHTML": false,
"format": null,
"headerFormat": "",
"pointFormat": "<span style=\"color:{series.color}\">{series.name}</span>: <b>{point.y:.1f} token/sec</b>",
"footerFormat": "",
"shared": false,
"outside": false,
"valueDecimals": null,
"split": false
},
"annotations": null
},
"hc_type": "chart",
"id": "llama3:70b"
}
{
"configuration": {
"chart": {
"type": "bar",
"polar": false,
"zoomType": "",
"options3d": {},
"height": 550,
"width": null,
"margin": null,
"inverted": false,
"zooming": {}
},
"credits": {
"enabled": false
},
"title": {
"text": "Llama3.1:8b-instruct-q8_0"
},
"colorAxis": null,
"subtitle": {
"text": ""
},
"xAxis": {
"title": {
"text": [
""
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": [
"provider__short_name",
"flavor__name",
"flavor__gpu_model"
],
"lineWidth": 1,
"tickInterval": null,
"tickWidth": 0,
"tickLength": 10,
"tickPixelInterval": null,
"plotLines": null,
"labels": {
"enabled": false,
"format": null,
"formatter": "",
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false,
"step": 0
},
"plotBands": null,
"visible": true,
"floor": null,
"ceiling": null,
"type": "linear",
"min": null,
"gridLineWidth": null,
"gridLineColor": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"tickmarkPlacement": null,
"units": null,
"minRange": null
},
"yAxis": {
"title": {
"text": [
"Token/second<br>Higher is better"
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": null,
"plotLines": null,
"plotBands": null,
"lineWidth": null,
"tickInterval": null,
"tickLength": 10,
"floor": null,
"ceiling": null,
"gridLineInterpolation": null,
"gridLineWidth": 1,
"gridLineColor": "#CCC",
"min": null,
"max": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"minRange": null,
"type": "linear",
"tickmarkPlacement": null,
"labels": {
"enabled": true,
"formatter": null,
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false
}
},
"zAxis": {
"title": {
"text": "Token/second<br>Higher is better"
}
},
"plotOptions": {
"series": {
"dataLabels": {
"enabled": true,
"format": "{series.name}",
"distance": 30,
"align": "left",
"inside": true,
"allowOverlap": false,
"style": {
"fontSize": "17px"
}
},
"showInLegend": null,
"turboThreshold": 1000,
"stacking": "",
"groupPadding": 0,
"centerInCategory": false,
"findNearestPointBy": "x"
}
},
"navigator": {
"enabled": false
},
"scrollbar": {
"enabled": false
},
"rangeSelector": {
"enabled": false,
"inputEnabled": false
},
"legend": {
"enabled": true,
"maxHeight": null,
"align": "center",
"verticalAlign": "bottom",
"layout": "horizontal",
"width": null,
"margin": 12,
"reversed": false
},
"series": [
{
"name": "AWS g5.xlarge NVIDIA A10G",
"data": [
[
0,
49.45721996577291
]
],
"grouping": false,
"color": "#f7981d"
},
{
"name": "AWS p3.2xlarge Tesla V100-SXM2-16GB",
"data": [
[
1,
61.257833654542665
]
],
"grouping": false,
"color": "#f7981d"
},
{
"name": "Azure Standard_NC40ads_H100_v5 NVIDIA H100 NVL",
"data": [
[
2,
160.02328797391598
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Azure Standard_NC6s_v3 Tesla V100-PCIE-16GB",
"data": [
[
3,
63.16613714377257
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Google a2-highgpu-1g NVIDIA A100-SXM4-40GB",
"data": [
[
4,
82.24751543850662
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "Google a2-ultragpu-1g NVIDIA A100-SXM4-80GB",
"data": [
[
5,
91.5994552681905
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "Google g2-standard-16 NVIDIA L4",
"data": [
[
6,
27.006729680994383
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "Google n1-highmem-8 Intel Skylake Tesla V100 Tesla V100-SXM2-16GB",
"data": [
[
7,
64.7023253930343
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "OVH A10-45 NVIDIA A10",
"data": [
[
8,
52.23567869632677
]
],
"grouping": false,
"color": "#484848"
},
{
"name": "OVH H100-380 NVIDIA H100 PCIe",
"data": [
[
9,
124.8827190203083
]
],
"grouping": false,
"color": "#484848"
},
{
"name": "OVH L4-90 NVIDIA L4",
"data": [
[
10,
28.934930816750924
]
],
"grouping": false,
"color": "#484848"
},
{
"name": "OVH L40S-90 NVIDIA L40S",
"data": [
[
11,
72.03119918751457
]
],
"grouping": false,
"color": "#484848"
},
{
"name": "OVH T1-45 Tesla V100-PCIE-16GB",
"data": [
[
12,
60.47893212079448
]
],
"grouping": false,
"color": "#484848"
}
],
"drilldown": {},
"tooltip": {
"enabled": true,
"useHTML": false,
"format": null,
"headerFormat": "",
"pointFormat": "<span style=\"color:{series.color}\">{series.name}</span>: <b>{point.y:.1f} token/sec</b>",
"footerFormat": "",
"shared": false,
"outside": false,
"valueDecimals": null,
"split": false
},
"annotations": null
},
"hc_type": "chart",
"id": "llama3.1:8b-instruct-q8_0"
}
{
"configuration": {
"chart": {
"type": "bar",
"polar": false,
"zoomType": "",
"options3d": {},
"height": 550,
"width": null,
"margin": null,
"inverted": false,
"zooming": {}
},
"credits": {
"enabled": false
},
"title": {
"text": "Llama3.1:70b-instruct-q8_0"
},
"colorAxis": null,
"subtitle": {
"text": ""
},
"xAxis": {
"title": {
"text": [
""
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": [
"provider__short_name",
"flavor__name",
"flavor__gpu_model"
],
"lineWidth": 1,
"tickInterval": null,
"tickWidth": 0,
"tickLength": 10,
"tickPixelInterval": null,
"plotLines": null,
"labels": {
"enabled": false,
"format": null,
"formatter": "",
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false,
"step": 0
},
"plotBands": null,
"visible": true,
"floor": null,
"ceiling": null,
"type": "linear",
"min": null,
"gridLineWidth": null,
"gridLineColor": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"tickmarkPlacement": null,
"units": null,
"minRange": null
},
"yAxis": {
"title": {
"text": [
"Token/second<br>Higher is better"
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": null,
"plotLines": null,
"plotBands": null,
"lineWidth": null,
"tickInterval": null,
"tickLength": 10,
"floor": null,
"ceiling": null,
"gridLineInterpolation": null,
"gridLineWidth": 1,
"gridLineColor": "#CCC",
"min": null,
"max": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"minRange": null,
"type": "linear",
"tickmarkPlacement": null,
"labels": {
"enabled": true,
"formatter": null,
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false
}
},
"zAxis": {
"title": {
"text": "Token/second<br>Higher is better"
}
},
"plotOptions": {
"series": {
"dataLabels": {
"enabled": true,
"format": "{series.name}",
"distance": 30,
"align": "left",
"inside": true,
"allowOverlap": false,
"style": {
"fontSize": "17px"
}
},
"showInLegend": null,
"turboThreshold": 1000,
"stacking": "",
"groupPadding": 0,
"centerInCategory": false,
"findNearestPointBy": "x"
}
},
"navigator": {
"enabled": false
},
"scrollbar": {
"enabled": false
},
"rangeSelector": {
"enabled": false,
"inputEnabled": false
},
"legend": {
"enabled": true,
"maxHeight": null,
"align": "center",
"verticalAlign": "bottom",
"layout": "horizontal",
"width": null,
"margin": 12,
"reversed": false
},
"series": [
{
"name": "Azure Standard_NC40ads_H100_v5 NVIDIA H100 NVL",
"data": [
[
0,
27.38188940629102
]
],
"grouping": false,
"color": "#55b3ff"
},
{
"name": "Google a2-ultragpu-1g NVIDIA A100-SXM4-80GB",
"data": [
[
1,
17.472856372685843
]
],
"grouping": false,
"color": "#55b400"
},
{
"name": "OVH H100-380 NVIDIA H100 PCIe",
"data": [
[
2,
21.132689666982362
]
],
"grouping": false,
"color": "#484848"
}
],
"drilldown": {},
"tooltip": {
"enabled": true,
"useHTML": false,
"format": null,
"headerFormat": "",
"pointFormat": "<span style=\"color:{series.color}\">{series.name}</span>: <b>{point.y:.1f} token/sec</b>",
"footerFormat": "",
"shared": false,
"outside": false,
"valueDecimals": null,
"split": false
},
"annotations": null
},
"hc_type": "chart",
"id": "llama3.1:70b-instruct-q8_0"
}
Cloud Mercato's observations:
- Without at least 40GB of VRAM, it's impossible for most VMs to run llama3 70b.
- 80GB GPUs, and especially the H100, are the best options for running large llama3 models.
- No matter the size of the model, H100 GPUs are always the best performers.