jermp
diff --git a/‎.clang-format‎
Lines changed: 151 additions & 0 deletions b/‎.clang-format‎
Lines changed: 151 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.gitmodules‎
Lines changed: 3 additions & 0 deletions b/‎.gitmodules‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 36 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 185 additions & 0 deletions b/‎README.md‎
Lines changed: 185 additions & 0 deletions
diff --git a/‎data/test_collection.docs‎
13.1 MB b/‎data/test_collection.docs‎
13.1 MB
@@ -0,0 +1,151 @@
+---
+Language:        Cpp
+# BasedOnStyle:  Google
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: true
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  AfterExternBlock: false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeComma
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: true
+BreakConstructorInitializers: BeforeComma
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit:     80
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks:   Preserve
+IncludeCategories:
+  - Regex:           '^<ext/.*\.h>'
+    Priority:        2
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+  - Regex:           '^<.*'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        3
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IndentCaseLabels: true
+IndentPPDirectives: None
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Never
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+RawStringFormats:
+  - Language:        Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+    BasedOnStyle:    google
+  - Language:        TextProto
+    Delimiters:
+      - pb
+      - PB
+      - proto
+      - PROTO
+    EnclosingFunctions:
+      - EqualsProto
+      - EquivToProto
+      - PARSE_PARTIAL_TEXT_PROTO
+      - PARSE_TEST_PROTO
+      - PARSE_TEXT_PROTO
+      - ParseTextOrDie
+      - ParseTextProtoOrDie
+    CanonicalDelimiter: ''
+    BasedOnStyle:    google
+ReflowComments:  true
+SortIncludes:    false
+SortUsingDeclarations: false
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Auto
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+TabWidth:        8
+UseTab:          Never
+...
+
@@ -0,0 +1,2 @@
+.DS_Store
+build
@@ -0,0 +1,3 @@
+[submodule "external/mm_file"]
+	path = external/mm_file
+	url = https://github.com/jermp/mm_file.git
@@ -0,0 +1,36 @@
+# Top-level build script for the BIC project.
+# Sets the global C++ compiler flags, adds the project's include/ directory
+# to the header search path, then descends into src/ and test/.
+#
+# Switches read by this file (pass them as -D<NAME>=On to cmake):
+#   USE_SANITIZERS - adds -fsanitize=address -fno-omit-frame-pointer
+#   RUNAWARE       - adds -DRUNAWARE
+
+# The "...3.5" upper bound keeps 2.8 as the minimum required version while
+# raising the policy version to 3.5, which CMake >= 4.0 requires in order to
+# configure the project at all. CMake < 3.12 ignores the "...3.5" part.
+cmake_minimum_required(VERSION 2.8...3.5)
+project(BIC)
+
+# Default to a Release build when no build type was given.
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE "Release")
+endif()
+message(STATUS "CMAKE_BUILD_TYPE: " ${CMAKE_BUILD_TYPE})
+
+# Place all executables directly in the top-level build directory.
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+# Match both "Clang" and "AppleClang": once policy CMP0025 is NEW (policy
+# version >= 3.0) Apple's compiler no longer reports itself as plain "Clang".
+if("${CMAKE_CXX_COMPILER_ID}" MATCHES "^(Apple)?Clang$")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
+endif()
+
+if(UNIX)
+
+  # These flags are appended to CMAKE_CXX_FLAGS and therefore apply to
+  # every build type, including Debug.
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-missing-braces")
+
+  if(USE_SANITIZERS)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+  endif()
+
+  if(RUNAWARE)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DRUNAWARE")
+  endif()
+
+endif()
+
+include_directories(${BIC_SOURCE_DIR}/include)
+
+add_subdirectory(src)
+add_subdirectory(test)
@@ -0,0 +1,185 @@
+Binary Interpolative Coding
+====
+
+A C++ library implementing the *Binary Interpolative Coding* compression algorithm invented by Alistair Moffat and Lang Stuiver [1].
+
+The algorithm can be used to compress sorted integer sequences (here,
+assumed to be increasing).
+
+The implementation comes in different flavours:
+you can choose between
+simple *binary* codes, *left-most minimal* codes and *centered minimal* codes.
+Additionally, the implementation is *run-aware*, i.e.,
+it optimizes encoding/decoding of runs of consecutive identifiers.
+
+##### Table of contents
+* [Compiling the code](#compiling-the-code)
+* [Quick Start](#quick-start)
+* [Encoding/decoding a collection of sequences](#encodingdecoding-a-collection-of-sequences)
+* [Benchmark](#benchmark)
+* [Author](#author)
+* [References](#references)
+
+Compiling the code
+------------------
+
+The code is tested on Linux with `gcc` 7.3.0 and on Mac 10.14 with `clang` 10.0.0.
+To build the code, [`CMake`](https://cmake.org/) is required.
+
+Clone the repository with
+	
+	$ git clone --recursive https://github.com/jermp/interpolative_coding.git
+
+If you have cloned the repository without `--recursive`, you will need to perform the following commands before
+compiling:
+
+    $ git submodule init
+    $ git submodule update
+
+To compile the code for a release environment *and* best performance (see file `CMakeLists.txt` for the used compilation flags), do:
+
+    $ mkdir build
+    $ cd build
+    $ cmake .. -DRUNAWARE=On
+    $ make
+
+Hint: Use `make -j4` to compile the library in parallel using, e.g., 4 jobs.
+
+For a testing environment, use the following instead:
+
+    $ mkdir debug_build
+    $ cd debug_build
+    $ cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZERS=On
+    $ make
+
+Quick Start
+-------
+
+For a quick start, see the source file `test/example.cpp`.
+After compilation, run this example with
+
+	$ ./example
+
+A simpler variation is shown below.
+
+```C++
+#include <cassert>
+#include <cstdint>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "interpolative_coding.hpp"
+using namespace bic;
+
+// Encodes `in` with the writer of the given binary code, decodes the
+// resulting bits with the matching reader, and prints the input, the
+// number of decoded values, the space usage and the decoded output.
+template <typename BinaryCode>
+void test(std::vector<uint32_t> const& in) {
+    std::cout << "to be encoded:\n";
+    for (auto x : in) {
+        std::cout << x << " ";
+    }
+    std::cout << std::endl;
+
+    uint32_t n = in.size();
+
+    encoder<typename BinaryCode::writer> enc;
+    enc.encode(in.data(), n);
+
+    // The output buffer is sized for exactly n values.
+    std::vector<uint32_t> out(n);
+    decoder<typename BinaryCode::reader> dec;
+    uint32_t m = dec.decode(enc.bits().data(), out.data());
+    assert(m == n);
+
+    std::cout << "decoded " << m << " values" << std::endl;
+    std::cout << "total bits " << enc.num_bits() << std::endl;
+    std::cout << static_cast<double>(enc.num_bits()) / m << " bits x key"
+              << std::endl;
+
+    std::cout << "decoded:\n";
+    for (auto x : out) {
+        std::cout << x << " ";
+    }
+    std::cout << std::endl;
+}
+
+// Runs test() on a fixed example sequence, using the binary code type
+// named by the first command-line argument. Returns 1 when the argument
+// is missing or names an unknown type, 0 otherwise.
+int main(int argc, char** argv) {
+    if (argc < 2) {
+        std::cerr << argv[0] << " binary_code_type" << std::endl;
+        return 1;
+    }
+
+    std::vector<uint32_t> in = {3, 4, 7, 13, 14, 15, 21, 25, 36, 38, 54, 62};
+
+    std::string type(argv[1]);
+
+    if (type == "binary") {
+        test<binary>(in);
+    } else if (type == "leftmost_minimal") {
+        test<leftmost_minimal>(in);
+    } else if (type == "centered_minimal") {
+        test<centered_minimal>(in);
+    } else {
+        std::cerr << "unknown type '" << type << "'" << std::endl;
+        return 1;
+    }
+
+    return 0;
+}
+```
+
+Encoding/decoding a collection of sequences
+----------------------------------
+
+Typically, we want to build all the sequences from
+a collection.
+In this case, we assume that the input collection
+is a binary file with all the sequences being written
+as 32-bit integers. In this library, we follow the
+input data format of the [`ds2i`](https://github.com/ot/ds2i) library:
+each sequence is prefixed by an additional
+32-bit integer representing the size of the sequence.
+The collection file starts with a singleton sequence
+containing the universe of representation of the sequences, i.e., the maximum representable value.
+
+We also assume all sequences are *increasing*.
+
+The file `data/test_collection.docs` represents an example of
+such organization.
+
+To encode all the sequences from this file, do:
+
+	$ ./encode leftmost_minimal ../data/test_collection.docs -o test.bin
+
+To decode all the sequences from the encoded file `test.bin`, do:
+
+	$ ./decode leftmost_minimal test.bin
+
+To check correctness of the implementation, use:
+
+	$ ./check leftmost_minimal ../data/test_collection.docs test.bin
+
+which will compare every decoded integer against the input collection.
+
+Benchmark
+------
+For this benchmark we used the whole Gov2 dataset, containing
+5,742,630,292 integers in 35,636,425 sequences.
+
+We report the average number of bits per integer (bpi)
+and nanoseconds spent per decoded integer (with and without the
+run-aware optimization).
+
+Time measurements were taken using a Linux 4.4.0 server machine with
+an Intel i7-7700 CPU (@3.6 GHz) and 64 GB of RAM.
+The code was compiled with gcc 7.3.0 with all optimizations
+(see also `CMakeLists.txt`).
+
+|**Method**        |**bpi** | **ns/int (run-aware)**  | **ns/int (not run-aware)**|
+|:-----------------|:------:|:-----------------------:|:-------------------------:|
+|simple            |3.532   | 3.45                    | 4.65                      |
+|left-most minimal |3.362   | 5.78                    | 7.07                      |
+|centered minimal  |3.361   | 5.78                    | 7.07                      |
+
+Author
+------
+* [Giulio Ermanno Pibiri](http://pages.di.unipi.it/pibiri/), <giulio.ermanno.pibiri@isti.cnr.it>
+
+References
+-------
+* [1] Alistair Moffat and Lang Stuiver. 2000. Binary Interpolative Coding for Effective Index Compression. Information Retrieval Journal 3, 1 (2000), 25 – 47.
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+[submodule "external/mm_file"]`
	`2`	`+ path = external/mm_file`
	`3`	`+ url = https://github.com/jermp/mm_file.git`