Ollama benchmark Q2 2024 - Exoscale A40 - LLM Performance

Filter:

GPU model

Model

From the large open-source LLM ecosystem, we selected the most-used models available through Ollama and submitted the following prompt several times to each model:

Make a 1000-words paragraph telling how I am beautiful

From this task, with verbose output enabled, we capture:

eval_rate: inference speed in tokens/sec
prompt_eval_rate: prompt reading speed in tokens/sec

{ "configuration": { "chart": { "type": "bar", "polar": false, "zoomType": "", "options3d": {}, "height": 1000, "width": null, "margin": null, "inverted": false, "zooming": {} }, "credits": { "enabled": false }, "title": { "text": "eval_rate" }, "colorAxis": null, "subtitle": { "text": "" }, "xAxis": { "title": { "text": [ "" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": [ "model" ], "lineWidth": 1, "tickInterval": null, "tickWidth": 0, "tickLength": 10, "tickPixelInterval": null, "plotLines": null, "labels": { "enabled": false, "format": null, "formatter": "", "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false, "step": 0 }, "plotBands": null, "visible": true, "floor": null, "ceiling": null, "type": "linear", "min": null, "gridLineWidth": null, "gridLineColor": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "tickmarkPlacement": null, "units": null, "minRange": null }, "yAxis": { "title": { "text": [ "Token/second Higher is better" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": null, "plotLines": null, "plotBands": null, "lineWidth": null, "tickInterval": null, "tickLength": 10, "floor": null, "ceiling": null, "gridLineInterpolation": null, "gridLineWidth": 1, "gridLineColor": "#CCC", "min": null, "max": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "minRange": null, "type": "linear", "tickmarkPlacement": null, "labels": { "enabled": true, "formatter": null, "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false } }, "zAxis": { "title": { "text": "Token/second Higher is better" } }, "plotOptions": { "series": { "dataLabels": { "enabled": true, "format": "{series.name}", "distance": 30, "align": "left", "inside": true, "allowOverlap": false, "style": { "fontSize": "17px" } }, "showInLegend": null, 
"turboThreshold": 1000, "stacking": "", "groupPadding": 0, "centerInCategory": false, "findNearestPointBy": "x" } }, "navigator": { "enabled": false }, "scrollbar": { "enabled": false }, "rangeSelector": { "enabled": false, "inputEnabled": false }, "legend": { "enabled": false, "maxHeight": null, "align": "center", "verticalAlign": "bottom", "layout": "horizontal", "width": null, "margin": 12, "reversed": false }, "series": [ { "name": "codegemma", "data": [ [ 0, 73.63 ] ], "grouping": false }, { "name": "codegemma:2b", "data": [ [ 1, 158.54 ] ], "grouping": false }, { "name": "codellama", "data": [ [ 2, 97.27 ] ], "grouping": false }, { "name": "codellama:13b", "data": [ [ 3, 58.22 ] ], "grouping": false }, { "name": "codellama:70b", "data": [ [ 4, 13.33 ] ], "grouping": false }, { "name": "deepseek-coder", "data": [ [ 5, 251.64499999999998 ] ], "grouping": false }, { "name": "deepseek-coder:33b", "data": [ [ 6, 26.21 ] ], "grouping": false }, { "name": "deepseek-coder:6.7b", "data": [ [ 7, 100.005 ] ], "grouping": false }, { "name": "dolphin-mixtral", "data": [ [ 8, 52.394999999999996 ] ], "grouping": false }, { "name": "gemma", "data": [ [ 9, 77.52714285714286 ] ], "grouping": false }, { "name": "gemma:2b", "data": [ [ 10, 148.41285714285712 ] ], "grouping": false }, { "name": "llama2", "data": [ [ 11, 96.82666666666667 ] ], "grouping": false }, { "name": "llama2-uncensored", "data": [ [ 12, 99.71714285714286 ] ], "grouping": false }, { "name": "llama2-uncensored:70b", "data": [ [ 13, 13.274999999999999 ] ], "grouping": false }, { "name": "llama2:13b", "data": [ [ 14, 58.041666666666664 ] ], "grouping": false }, { "name": "llama2:70b", "data": [ [ 15, 13.288999999999998 ] ], "grouping": false }, { "name": "llama3", "data": [ [ 16, 80.26 ] ], "grouping": false }, { "name": "llama3:70b", "data": [ [ 17, 12.793333333333331 ] ], "grouping": false }, { "name": "llava", "data": [ [ 18, 95.28 ] ], "grouping": false }, { "name": "llava:13b", "data": [ [ 19, 58.475 ] ], 
"grouping": false }, { "name": "llava:34b", "data": [ [ 20, 25.02 ] ], "grouping": false }, { "name": "mistral", "data": [ [ 21, 92.01285714285714 ] ], "grouping": false }, { "name": "mixtral", "data": [ [ 22, 53.06166666666667 ] ], "grouping": false }, { "name": "orca-mini", "data": [ [ 23, 155.99285714285716 ] ], "grouping": false }, { "name": "orca-mini:13b", "data": [ [ 24, 57.635000000000005 ] ], "grouping": false }, { "name": "orca-mini:70b", "data": [ [ 25, 13.125 ] ], "grouping": false }, { "name": "orca-mini:7b", "data": [ [ 26, 95.295 ] ], "grouping": false }, { "name": "phi3", "data": [ [ 27, 124.10000000000001 ] ], "grouping": false }, { "name": "qwen", "data": [ [ 28, 103.12 ] ], "grouping": false }, { "name": "qwen:0.5b", "data": [ [ 29, 206.85500000000002 ] ], "grouping": false }, { "name": "qwen:1.8b", "data": [ [ 30, 162.41333333333333 ] ], "grouping": false }, { "name": "qwen:14b", "data": [ [ 31, 51.19 ] ], "grouping": false }, { "name": "qwen:32b", "data": [ [ 32, 24.665 ] ], "grouping": false }, { "name": "qwen:72b", "data": [ [ 33, 12.504999999999999 ] ], "grouping": false }, { "name": "qwen:7b", "data": [ [ 34, 81.245 ] ], "grouping": false }, { "name": "vicuna", "data": [ [ 35, 96.77142857142857 ] ], "grouping": false }, { "name": "vicuna:13b", "data": [ [ 36, 57.55 ] ], "grouping": false }, { "name": "vicuna:33b", "data": [ [ 37, 25.25 ] ], "grouping": false }, { "name": "wizardlm2", "data": [ [ 38, 90.97142857142856 ] ], "grouping": false } ], "drilldown": {}, "tooltip": { "enabled": true, "useHTML": false, "format": null, "headerFormat": "", "pointFormat": "{series.name}: {point.y:.1f} token/sec", "footerFormat": "", "shared": false, "outside": false, "valueDecimals": null, "split": false }, "annotations": null }, "hc_type": "chart", "id": "eval_rate" }

{ "configuration": { "chart": { "type": "bar", "polar": false, "zoomType": "", "options3d": {}, "height": 1000, "width": null, "margin": null, "inverted": false, "zooming": {} }, "credits": { "enabled": false }, "title": { "text": "prompt_eval_rate" }, "colorAxis": null, "subtitle": { "text": "" }, "xAxis": { "title": { "text": [ "" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": [ "model" ], "lineWidth": 1, "tickInterval": null, "tickWidth": 0, "tickLength": 10, "tickPixelInterval": null, "plotLines": null, "labels": { "enabled": false, "format": null, "formatter": "", "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false, "step": 0 }, "plotBands": null, "visible": true, "floor": null, "ceiling": null, "type": "linear", "min": null, "gridLineWidth": null, "gridLineColor": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "tickmarkPlacement": null, "units": null, "minRange": null }, "yAxis": { "title": { "text": [ "Token/second Higher is better" ], "useHTML": false, "style": { "color": "#666666" } }, "categories": null, "plotLines": null, "plotBands": null, "lineWidth": null, "tickInterval": null, "tickLength": 10, "floor": null, "ceiling": null, "gridLineInterpolation": null, "gridLineWidth": 1, "gridLineColor": "#CCC", "min": null, "max": null, "minorTickInterval": null, "minorTickWidth": 0, "minTickInterval": null, "startOnTick": true, "endOnTick": null, "minRange": null, "type": "linear", "tickmarkPlacement": null, "labels": { "enabled": true, "formatter": null, "style": { "color": "#666666", "cursor": "default", "fontSize": "11px" }, "useHTML": false } }, "zAxis": { "title": { "text": "Token/second Higher is better" } }, "plotOptions": { "series": { "dataLabels": { "enabled": true, "format": "{series.name}", "distance": 30, "align": "left", "inside": true, "allowOverlap": false, "style": { "fontSize": "17px" } }, "showInLegend": null, 
"turboThreshold": 1000, "stacking": "", "groupPadding": 0, "centerInCategory": false, "findNearestPointBy": "x" } }, "navigator": { "enabled": false }, "scrollbar": { "enabled": false }, "rangeSelector": { "enabled": false, "inputEnabled": false }, "legend": { "enabled": false, "maxHeight": null, "align": "center", "verticalAlign": "bottom", "layout": "horizontal", "width": null, "margin": 12, "reversed": false }, "series": [ { "name": "codegemma", "data": [ [ 0, 222.285 ] ], "grouping": false }, { "name": "codegemma:2b", "data": [ [ 1, 315.01 ] ], "grouping": false }, { "name": "codellama", "data": [ [ 2, 269.98857142857145 ] ], "grouping": false }, { "name": "codellama:13b", "data": [ [ 3, 188.85999999999999 ] ], "grouping": false }, { "name": "codellama:70b", "data": [ [ 4, 61.019999999999996 ] ], "grouping": false }, { "name": "deepseek-coder", "data": [ [ 5, 649.05 ] ], "grouping": false }, { "name": "deepseek-coder:33b", "data": [ [ 6, 75.1 ] ], "grouping": false }, { "name": "deepseek-coder:6.7b", "data": [ [ 7, 249.655 ] ], "grouping": false }, { "name": "dolphin-mixtral", "data": [ [ 8, 142.885 ] ], "grouping": false }, { "name": "gemma", "data": [ [ 9, 222.6457142857143 ] ], "grouping": false }, { "name": "gemma:2b", "data": [ [ 10, 522.8314285714285 ] ], "grouping": false }, { "name": "llama2", "data": [ [ 11, 252.655 ] ], "grouping": false }, { "name": "llama2-uncensored", "data": [ [ 12, 327.15000000000003 ] ], "grouping": false }, { "name": "llama2-uncensored:70b", "data": [ [ 13, 56.82 ] ], "grouping": false }, { "name": "llama2:13b", "data": [ [ 14, 162.36833333333334 ] ], "grouping": false }, { "name": "llama2:70b", "data": [ [ 15, 43.46181818181818 ] ], "grouping": false }, { "name": "llama3", "data": [ [ 16, 201.97571428571428 ] ], "grouping": false }, { "name": "llama3:70b", "data": [ [ 17, 39.275999999999996 ] ], "grouping": false }, { "name": "llava", "data": [ [ 18, 223.115 ] ], "grouping": false }, { "name": "llava:13b", "data": [ [ 19, 
170.65 ] ], "grouping": false }, { "name": "llava:34b", "data": [ [ 20, 79.66499999999999 ] ], "grouping": false }, { "name": "mistral", "data": [ [ 21, 222.80714285714288 ] ], "grouping": false }, { "name": "mixtral", "data": [ [ 22, 148.10714285714286 ] ], "grouping": false }, { "name": "orca-mini", "data": [ [ 23, 477.8228571428572 ] ], "grouping": false }, { "name": "orca-mini:13b", "data": [ [ 24, 184.7 ] ], "grouping": false }, { "name": "orca-mini:70b", "data": [ [ 25, 50.32 ] ], "grouping": false }, { "name": "orca-mini:7b", "data": [ [ 26, 289.18 ] ], "grouping": false }, { "name": "phi3", "data": [ [ 27, 439.7728571428571 ] ], "grouping": false }, { "name": "qwen", "data": [ [ 28, 309.875 ] ], "grouping": false }, { "name": "qwen:0.5b", "data": [ [ 29, 1042.5 ] ], "grouping": false }, { "name": "qwen:1.8b", "data": [ [ 30, 599.3766666666667 ] ], "grouping": false }, { "name": "qwen:14b", "data": [ [ 31, 156.82 ] ], "grouping": false }, { "name": "qwen:32b", "data": [ [ 32, 73.925 ] ], "grouping": false }, { "name": "qwen:72b", "data": [ [ 33, 45.53 ] ], "grouping": false }, { "name": "qwen:7b", "data": [ [ 34, 242.86 ] ], "grouping": false }, { "name": "vicuna", "data": [ [ 35, 265.2714285714286 ] ], "grouping": false }, { "name": "vicuna:13b", "data": [ [ 36, 170.42000000000002 ] ], "grouping": false }, { "name": "vicuna:33b", "data": [ [ 37, 87.39500000000001 ] ], "grouping": false }, { "name": "wizardlm2", "data": [ [ 38, 233.61714285714285 ] ], "grouping": false } ], "drilldown": {}, "tooltip": { "enabled": true, "useHTML": false, "format": null, "headerFormat": "", "pointFormat": "{series.name}: {point.y:.1f} token/sec", "footerFormat": "", "shared": false, "outside": false, "valueDecimals": null, "split": false }, "annotations": null }, "hc_type": "chart", "id": "prompt_eval_rate" }

Notes

Not all models are general-purpose; some, like deepseek-coder or codegemma, are specifically designed to produce code.
This test mostly isolates one aspect: the capacity to produce a response as quickly as possible.
Only models fitting in an NVIDIA A40 (48 GB) are tested.