
Commit a5cfd46

Add Protobuf standard benchmarks (#122)
1 parent 3e493cc commit a5cfd46

3 files changed, +107 -0 lines changed


bench/.gitignore

Lines changed: 3 additions & 0 deletions
@@ -2,3 +2,6 @@
 /deps
 erl_crash.dump
 benchmarks
+/data/datasets.tar.gz
+/data/dataset.google_message3*.pb
+/data/dataset.google_message4.pb

bench/README.md

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,54 @@ Generated benchmarks/output/encode.html
144144
Opened report using open
145145
```
146146

147+
## Protobuf standard benchmarks
148+
149+
Protobuf includes benchmarks for its official language implementations, such as Python, C++
150+
and Golang. They measure average encode and decode throughput for each built-in dataset. This
151+
is useful to check how Elixir matches up with them. You can read more about these benchmarks
152+
[here](https://github.com/protocolbuffers/protobuf/blob/master/benchmarks/README.md).
153+
154+
To run the standard benchmarks for Elixir, download the datasets then run `standard_bench.exs`.
155+
156+
```console
157+
$ mix run script/standard_bench.exs
158+
Message benchmarks.proto2.GoogleMessage1 of dataset file data/dataset.google_message1_proto2.pb
159+
Average throughput for parse_from_benchmark: 18.48 MB/s
160+
Average throughput for serialize_to_benchmark: 6.19 MB/s
161+
162+
Message benchmarks.proto3.GoogleMessage1 of dataset file data/dataset.google_message1_proto3.pb
163+
Average throughput for parse_from_benchmark: 18.4 MB/s
164+
Average throughput for serialize_to_benchmark: 11.1 MB/s
165+
166+
Message benchmarks.proto2.GoogleMessage2 of dataset file data/dataset.google_message2.pb
167+
Average throughput for parse_from_benchmark: 47.82 MB/s
168+
Average throughput for serialize_to_benchmark: 5656.75 MB/s
169+
170+
Message benchmarks.google_message3.GoogleMessage3 of dataset file data/dataset.google_message3_1.pb
171+
Average throughput for parse_from_benchmark: 19.94 MB/s
172+
Average throughput for serialize_to_benchmark: 45.5 MB/s
173+
174+
Message benchmarks.google_message3.GoogleMessage3 of dataset file data/dataset.google_message3_2.pb
175+
Average throughput for parse_from_benchmark: 110.65 MB/s
176+
Average throughput for serialize_to_benchmark: 164.96 MB/s
177+
178+
Message benchmarks.google_message3.GoogleMessage3 of dataset file data/dataset.google_message3_3.pb
179+
Average throughput for parse_from_benchmark: 9.8 MB/s
180+
Average throughput for serialize_to_benchmark: 6.84 MB/s
181+
182+
Message benchmarks.google_message3.GoogleMessage3 of dataset file data/dataset.google_message3_4.pb
183+
Average throughput for parse_from_benchmark: 5254.14 MB/s
184+
Average throughput for serialize_to_benchmark: 737.71 MB/s
185+
186+
Message benchmarks.google_message3.GoogleMessage3 of dataset file data/dataset.google_message3_5.pb
187+
Average throughput for parse_from_benchmark: 3.77 MB/s
188+
Average throughput for serialize_to_benchmark: 3.29 MB/s
189+
190+
Message benchmarks.google_message4.GoogleMessage4 of dataset file data/dataset.google_message4.pb
191+
Average throughput for parse_from_benchmark: 20.06 MB/s
192+
Average throughput for serialize_to_benchmark: 32.46 MB/s
193+
```
194+
147195
## Contributing
148196

149197
If you have trouble using the downloaded datasets, they might have been upgraded and their
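
A note on the "download the datasets" step above: the exact download location is not part of this commit (see the upstream benchmarks README linked in the new section). As a minimal sketch, assuming the tarball has been saved as `data/datasets.tar.gz` (the path this commit adds to `.gitignore`) and that it unpacks directly into the `dataset.*.pb` files the script globs, the steps from the `bench/` directory would look roughly like this:

```console
$ tar -xzf data/datasets.tar.gz -C data   # hypothetical layout: unpacks into data/*.pb
$ mix run script/standard_bench.exs
```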

bench/script/standard_bench.exs

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
# Standard benchmark. Its output is compatible with the built-in benchmarks from
# protobuf for official language implementations, including encoding and decoding
# throughput on each dataset.
#
# Based on Python's implementation:
# https://github.com/protocolbuffers/protobuf/blob/master/benchmarks/python/py_benchmark.py

single = fn fun, inputs ->
  Enum.reduce(inputs, 0, fn input, total ->
    {time, _result} = :timer.tc(fun, [input])
    total + time
  end)
end

repeat = fn fun, inputs, reps ->
  Enum.reduce(1..reps, 0, fn _, total ->
    total + single.(fun, inputs)
  end)
end

run = fn fun, inputs ->
  target_run_time = 3_000_000
  single_run_time = single.(fun, inputs)

  with true <- single_run_time < target_run_time,
       reps when reps > 1 <- trunc(ceil(target_run_time / single_run_time)) do
    repeat.(fun, inputs, reps) / reps
  else
    _ -> single_run_time
  end
end

throughput = fn bytes, microseconds ->
  megabytes = bytes / 1_048_576
  seconds = microseconds / 1_000_000
  Float.round(megabytes / seconds, 2)
end

for file <- Path.wildcard("data/*.pb") do
  %{payload: payloads, message_name: mod_name} = ProtoBench.load(file)
  module = ProtoBench.mod_name(mod_name)

  IO.puts("Message #{mod_name} of dataset file #{file}")

  bytes = Enum.reduce(payloads, 0, &(byte_size(&1) + &2))
  messages = Enum.map(payloads, &module.decode/1)

  parse = throughput.(bytes, run.(&module.decode/1, payloads))

  IO.puts("Average throughput for parse_from_benchmark: #{parse} MB/s")

  serialize = throughput.(bytes, run.(&module.encode/1, messages))

  IO.puts("Average throughput for serialize_to_benchmark: #{serialize} MB/s")
  IO.puts("")
end
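
A note on the timing policy in `run` (not part of the commit): it aims for roughly 3 s of measurement per operation. If one pass over the payloads finishes sooner, it repeats the pass `ceil(3_000_000 / single_run_time)` times and reports the average microseconds per pass; if a single pass already exceeds the target, that single-pass time is used as is. `throughput` then converts the average into MB/s. A self-contained sketch of that arithmetic with made-up numbers:

```elixir
# Illustrative numbers only; this mirrors the run/throughput arithmetic in
# standard_bench.exs rather than re-running the benchmark itself.
single_run_time = 200_000                               # one pass took 200 ms
target_run_time = 3_000_000                             # aim for ~3 s of total work
reps = trunc(ceil(target_run_time / single_run_time))   # => 15 repetitions

total_time = 3_090_000                                  # suppose the 15 passes took 3.09 s
average_us = total_time / reps                          # => 206_000.0 µs per pass

bytes = 5 * 1_048_576                                   # payloads totalling 5 MiB
mb_per_s = Float.round(bytes / 1_048_576 / (average_us / 1_000_000), 2)
IO.puts("#{mb_per_s} MB/s")                             # => 24.27 MB/s
```

Plugging real single-pass times and dataset sizes into the same arithmetic reproduces the figures printed in the README section above.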
