Ollama benchmark Q2 2024 - Exoscale A40 - LLM Performance
Go back to list
Among the large open-source market of LLMs, we selected the most-used models available through Ollama and submitted the following prompt several times to each model:
Make a 1000-words paragraph telling how I am beautiful
From this task, with verbose output enabled, we capture:
- eval_rate: Inference speed in tokens/sec
- prompt_eval_rate: Prompt reading speed in tokens/sec
{
"configuration": {
"chart": {
"type": "bar",
"polar": false,
"zoomType": "",
"options3d": {},
"height": 1000,
"width": null,
"margin": null,
"inverted": false,
"zooming": {}
},
"credits": {
"enabled": false
},
"title": {
"text": "eval_rate"
},
"colorAxis": null,
"subtitle": {
"text": ""
},
"xAxis": {
"title": {
"text": [
""
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": [
"model"
],
"lineWidth": 1,
"tickInterval": null,
"tickWidth": 0,
"tickLength": 10,
"tickPixelInterval": null,
"plotLines": null,
"labels": {
"enabled": false,
"format": null,
"formatter": "",
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false,
"step": 0
},
"plotBands": null,
"visible": true,
"floor": null,
"ceiling": null,
"type": "linear",
"min": null,
"gridLineWidth": null,
"gridLineColor": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"tickmarkPlacement": null,
"units": null,
"minRange": null
},
"yAxis": {
"title": {
"text": [
"Token/second<br>Higher is better"
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": null,
"plotLines": null,
"plotBands": null,
"lineWidth": null,
"tickInterval": null,
"tickLength": 10,
"floor": null,
"ceiling": null,
"gridLineInterpolation": null,
"gridLineWidth": 1,
"gridLineColor": "#CCC",
"min": null,
"max": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"minRange": null,
"type": "linear",
"tickmarkPlacement": null,
"labels": {
"enabled": true,
"formatter": null,
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false
}
},
"zAxis": {
"title": {
"text": "Token/second<br>Higher is better"
}
},
"plotOptions": {
"series": {
"dataLabels": {
"enabled": true,
"format": "{series.name}",
"distance": 30,
"align": "left",
"inside": true,
"allowOverlap": false,
"style": {
"fontSize": "17px"
}
},
"showInLegend": null,
"turboThreshold": 1000,
"stacking": "",
"groupPadding": 0,
"centerInCategory": false,
"findNearestPointBy": "x"
}
},
"navigator": {
"enabled": false
},
"scrollbar": {
"enabled": false
},
"rangeSelector": {
"enabled": false,
"inputEnabled": false
},
"legend": {
"enabled": false,
"maxHeight": null,
"align": "center",
"verticalAlign": "bottom",
"layout": "horizontal",
"width": null,
"margin": 12,
"reversed": false
},
"series": [
{
"name": "codegemma",
"data": [
[
0,
73.63
]
],
"grouping": false
},
{
"name": "codegemma:2b",
"data": [
[
1,
158.54
]
],
"grouping": false
},
{
"name": "codellama",
"data": [
[
2,
97.27
]
],
"grouping": false
},
{
"name": "codellama:13b",
"data": [
[
3,
58.22
]
],
"grouping": false
},
{
"name": "codellama:70b",
"data": [
[
4,
13.33
]
],
"grouping": false
},
{
"name": "deepseek-coder",
"data": [
[
5,
251.64499999999998
]
],
"grouping": false
},
{
"name": "deepseek-coder:33b",
"data": [
[
6,
26.21
]
],
"grouping": false
},
{
"name": "deepseek-coder:6.7b",
"data": [
[
7,
100.005
]
],
"grouping": false
},
{
"name": "dolphin-mixtral",
"data": [
[
8,
52.394999999999996
]
],
"grouping": false
},
{
"name": "gemma",
"data": [
[
9,
77.52714285714286
]
],
"grouping": false
},
{
"name": "gemma:2b",
"data": [
[
10,
148.41285714285712
]
],
"grouping": false
},
{
"name": "llama2",
"data": [
[
11,
96.82666666666667
]
],
"grouping": false
},
{
"name": "llama2-uncensored",
"data": [
[
12,
99.71714285714286
]
],
"grouping": false
},
{
"name": "llama2-uncensored:70b",
"data": [
[
13,
13.274999999999999
]
],
"grouping": false
},
{
"name": "llama2:13b",
"data": [
[
14,
58.041666666666664
]
],
"grouping": false
},
{
"name": "llama2:70b",
"data": [
[
15,
13.288999999999998
]
],
"grouping": false
},
{
"name": "llama3",
"data": [
[
16,
80.26
]
],
"grouping": false
},
{
"name": "llama3:70b",
"data": [
[
17,
12.793333333333331
]
],
"grouping": false
},
{
"name": "llava",
"data": [
[
18,
95.28
]
],
"grouping": false
},
{
"name": "llava:13b",
"data": [
[
19,
58.475
]
],
"grouping": false
},
{
"name": "llava:34b",
"data": [
[
20,
25.02
]
],
"grouping": false
},
{
"name": "mistral",
"data": [
[
21,
92.01285714285714
]
],
"grouping": false
},
{
"name": "mixtral",
"data": [
[
22,
53.06166666666667
]
],
"grouping": false
},
{
"name": "orca-mini",
"data": [
[
23,
155.99285714285716
]
],
"grouping": false
},
{
"name": "orca-mini:13b",
"data": [
[
24,
57.635000000000005
]
],
"grouping": false
},
{
"name": "orca-mini:70b",
"data": [
[
25,
13.125
]
],
"grouping": false
},
{
"name": "orca-mini:7b",
"data": [
[
26,
95.295
]
],
"grouping": false
},
{
"name": "phi3",
"data": [
[
27,
124.10000000000001
]
],
"grouping": false
},
{
"name": "qwen",
"data": [
[
28,
103.12
]
],
"grouping": false
},
{
"name": "qwen:0.5b",
"data": [
[
29,
206.85500000000002
]
],
"grouping": false
},
{
"name": "qwen:1.8b",
"data": [
[
30,
162.41333333333333
]
],
"grouping": false
},
{
"name": "qwen:14b",
"data": [
[
31,
51.19
]
],
"grouping": false
},
{
"name": "qwen:32b",
"data": [
[
32,
24.665
]
],
"grouping": false
},
{
"name": "qwen:72b",
"data": [
[
33,
12.504999999999999
]
],
"grouping": false
},
{
"name": "qwen:7b",
"data": [
[
34,
81.245
]
],
"grouping": false
},
{
"name": "vicuna",
"data": [
[
35,
96.77142857142857
]
],
"grouping": false
},
{
"name": "vicuna:13b",
"data": [
[
36,
57.55
]
],
"grouping": false
},
{
"name": "vicuna:33b",
"data": [
[
37,
25.25
]
],
"grouping": false
},
{
"name": "wizardlm2",
"data": [
[
38,
90.97142857142856
]
],
"grouping": false
}
],
"drilldown": {},
"tooltip": {
"enabled": true,
"useHTML": false,
"format": null,
"headerFormat": "",
"pointFormat": "<span style=\"color:{series.color}\">{series.name}</span>: <b>{point.y:.1f} token/sec</b>",
"footerFormat": "",
"shared": false,
"outside": false,
"valueDecimals": null,
"split": false
},
"annotations": null
},
"hc_type": "chart",
"id": "eval_rate"
}
{
"configuration": {
"chart": {
"type": "bar",
"polar": false,
"zoomType": "",
"options3d": {},
"height": 1000,
"width": null,
"margin": null,
"inverted": false,
"zooming": {}
},
"credits": {
"enabled": false
},
"title": {
"text": "prompt_eval_rate"
},
"colorAxis": null,
"subtitle": {
"text": ""
},
"xAxis": {
"title": {
"text": [
""
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": [
"model"
],
"lineWidth": 1,
"tickInterval": null,
"tickWidth": 0,
"tickLength": 10,
"tickPixelInterval": null,
"plotLines": null,
"labels": {
"enabled": false,
"format": null,
"formatter": "",
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false,
"step": 0
},
"plotBands": null,
"visible": true,
"floor": null,
"ceiling": null,
"type": "linear",
"min": null,
"gridLineWidth": null,
"gridLineColor": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"tickmarkPlacement": null,
"units": null,
"minRange": null
},
"yAxis": {
"title": {
"text": [
"Token/second<br>Higher is better"
],
"useHTML": false,
"style": {
"color": "#666666"
}
},
"categories": null,
"plotLines": null,
"plotBands": null,
"lineWidth": null,
"tickInterval": null,
"tickLength": 10,
"floor": null,
"ceiling": null,
"gridLineInterpolation": null,
"gridLineWidth": 1,
"gridLineColor": "#CCC",
"min": null,
"max": null,
"minorTickInterval": null,
"minorTickWidth": 0,
"minTickInterval": null,
"startOnTick": true,
"endOnTick": null,
"minRange": null,
"type": "linear",
"tickmarkPlacement": null,
"labels": {
"enabled": true,
"formatter": null,
"style": {
"color": "#666666",
"cursor": "default",
"fontSize": "11px"
},
"useHTML": false
}
},
"zAxis": {
"title": {
"text": "Token/second<br>Higher is better"
}
},
"plotOptions": {
"series": {
"dataLabels": {
"enabled": true,
"format": "{series.name}",
"distance": 30,
"align": "left",
"inside": true,
"allowOverlap": false,
"style": {
"fontSize": "17px"
}
},
"showInLegend": null,
"turboThreshold": 1000,
"stacking": "",
"groupPadding": 0,
"centerInCategory": false,
"findNearestPointBy": "x"
}
},
"navigator": {
"enabled": false
},
"scrollbar": {
"enabled": false
},
"rangeSelector": {
"enabled": false,
"inputEnabled": false
},
"legend": {
"enabled": false,
"maxHeight": null,
"align": "center",
"verticalAlign": "bottom",
"layout": "horizontal",
"width": null,
"margin": 12,
"reversed": false
},
"series": [
{
"name": "codegemma",
"data": [
[
0,
222.285
]
],
"grouping": false
},
{
"name": "codegemma:2b",
"data": [
[
1,
315.01
]
],
"grouping": false
},
{
"name": "codellama",
"data": [
[
2,
269.98857142857145
]
],
"grouping": false
},
{
"name": "codellama:13b",
"data": [
[
3,
188.85999999999999
]
],
"grouping": false
},
{
"name": "codellama:70b",
"data": [
[
4,
61.019999999999996
]
],
"grouping": false
},
{
"name": "deepseek-coder",
"data": [
[
5,
649.05
]
],
"grouping": false
},
{
"name": "deepseek-coder:33b",
"data": [
[
6,
75.1
]
],
"grouping": false
},
{
"name": "deepseek-coder:6.7b",
"data": [
[
7,
249.655
]
],
"grouping": false
},
{
"name": "dolphin-mixtral",
"data": [
[
8,
142.885
]
],
"grouping": false
},
{
"name": "gemma",
"data": [
[
9,
222.6457142857143
]
],
"grouping": false
},
{
"name": "gemma:2b",
"data": [
[
10,
522.8314285714285
]
],
"grouping": false
},
{
"name": "llama2",
"data": [
[
11,
252.655
]
],
"grouping": false
},
{
"name": "llama2-uncensored",
"data": [
[
12,
327.15000000000003
]
],
"grouping": false
},
{
"name": "llama2-uncensored:70b",
"data": [
[
13,
56.82
]
],
"grouping": false
},
{
"name": "llama2:13b",
"data": [
[
14,
162.36833333333334
]
],
"grouping": false
},
{
"name": "llama2:70b",
"data": [
[
15,
43.46181818181818
]
],
"grouping": false
},
{
"name": "llama3",
"data": [
[
16,
201.97571428571428
]
],
"grouping": false
},
{
"name": "llama3:70b",
"data": [
[
17,
39.275999999999996
]
],
"grouping": false
},
{
"name": "llava",
"data": [
[
18,
223.115
]
],
"grouping": false
},
{
"name": "llava:13b",
"data": [
[
19,
170.65
]
],
"grouping": false
},
{
"name": "llava:34b",
"data": [
[
20,
79.66499999999999
]
],
"grouping": false
},
{
"name": "mistral",
"data": [
[
21,
222.80714285714288
]
],
"grouping": false
},
{
"name": "mixtral",
"data": [
[
22,
148.10714285714286
]
],
"grouping": false
},
{
"name": "orca-mini",
"data": [
[
23,
477.8228571428572
]
],
"grouping": false
},
{
"name": "orca-mini:13b",
"data": [
[
24,
184.7
]
],
"grouping": false
},
{
"name": "orca-mini:70b",
"data": [
[
25,
50.32
]
],
"grouping": false
},
{
"name": "orca-mini:7b",
"data": [
[
26,
289.18
]
],
"grouping": false
},
{
"name": "phi3",
"data": [
[
27,
439.7728571428571
]
],
"grouping": false
},
{
"name": "qwen",
"data": [
[
28,
309.875
]
],
"grouping": false
},
{
"name": "qwen:0.5b",
"data": [
[
29,
1042.5
]
],
"grouping": false
},
{
"name": "qwen:1.8b",
"data": [
[
30,
599.3766666666667
]
],
"grouping": false
},
{
"name": "qwen:14b",
"data": [
[
31,
156.82
]
],
"grouping": false
},
{
"name": "qwen:32b",
"data": [
[
32,
73.925
]
],
"grouping": false
},
{
"name": "qwen:72b",
"data": [
[
33,
45.53
]
],
"grouping": false
},
{
"name": "qwen:7b",
"data": [
[
34,
242.86
]
],
"grouping": false
},
{
"name": "vicuna",
"data": [
[
35,
265.2714285714286
]
],
"grouping": false
},
{
"name": "vicuna:13b",
"data": [
[
36,
170.42000000000002
]
],
"grouping": false
},
{
"name": "vicuna:33b",
"data": [
[
37,
87.39500000000001
]
],
"grouping": false
},
{
"name": "wizardlm2",
"data": [
[
38,
233.61714285714285
]
],
"grouping": false
}
],
"drilldown": {},
"tooltip": {
"enabled": true,
"useHTML": false,
"format": null,
"headerFormat": "",
"pointFormat": "<span style=\"color:{series.color}\">{series.name}</span>: <b>{point.y:.1f} token/sec</b>",
"footerFormat": "",
"shared": false,
"outside": false,
"valueDecimals": null,
"split": false
},
"annotations": null
},
"hc_type": "chart",
"id": "prompt_eval_rate"
}
Notes
- Not all models are general-purpose; some, like deepseek-coder or codegemma, are designed specifically to produce code
- This test mostly isolates the ability to produce a response as quickly as possible
- Only models that fit in an NVIDIA A40 (40GB) were tested