Skip to content

Commit 3ce2760

Browse files
committed
first commit
0 parents  commit 3ce2760

File tree

19 files changed

+1480
-0
lines changed

19 files changed

+1480
-0
lines changed

.clang-format

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
---
2+
Language: Cpp
3+
# BasedOnStyle: Google
4+
AccessModifierOffset: -4
5+
AlignAfterOpenBracket: Align
6+
AlignConsecutiveAssignments: false
7+
AlignConsecutiveDeclarations: false
8+
AlignEscapedNewlines: Left
9+
AlignOperands: true
10+
AlignTrailingComments: true
11+
AllowAllParametersOfDeclarationOnNextLine: true
12+
AllowShortBlocksOnASingleLine: false
13+
AllowShortCaseLabelsOnASingleLine: false
14+
AllowShortFunctionsOnASingleLine: Empty
15+
AllowShortIfStatementsOnASingleLine: true
16+
AllowShortLoopsOnASingleLine: true
17+
AlwaysBreakAfterDefinitionReturnType: None
18+
AlwaysBreakAfterReturnType: None
19+
AlwaysBreakBeforeMultilineStrings: true
20+
AlwaysBreakTemplateDeclarations: Yes
21+
BinPackArguments: true
22+
BinPackParameters: true
23+
BraceWrapping:
24+
AfterClass: false
25+
AfterControlStatement: false
26+
AfterEnum: false
27+
AfterFunction: false
28+
AfterNamespace: false
29+
AfterObjCDeclaration: false
30+
AfterStruct: false
31+
AfterUnion: false
32+
AfterExternBlock: false
33+
BeforeCatch: false
34+
BeforeElse: false
35+
IndentBraces: false
36+
SplitEmptyFunction: true
37+
SplitEmptyRecord: true
38+
SplitEmptyNamespace: true
39+
BreakBeforeBinaryOperators: None
40+
BreakBeforeBraces: Attach
41+
BreakBeforeInheritanceComma: false
42+
BreakInheritanceList: BeforeComma
43+
BreakBeforeTernaryOperators: true
44+
BreakConstructorInitializersBeforeComma: true
45+
BreakConstructorInitializers: BeforeComma
46+
BreakAfterJavaFieldAnnotations: false
47+
BreakStringLiterals: true
48+
ColumnLimit: 80
49+
CommentPragmas: '^ IWYU pragma:'
50+
CompactNamespaces: false
51+
ConstructorInitializerAllOnOneLineOrOnePerLine: true
52+
ConstructorInitializerIndentWidth: 4
53+
ContinuationIndentWidth: 4
54+
Cpp11BracedListStyle: true
55+
DerivePointerAlignment: false
56+
DisableFormat: false
57+
ExperimentalAutoDetectBinPacking: false
58+
FixNamespaceComments: true
59+
ForEachMacros:
60+
- foreach
61+
- Q_FOREACH
62+
- BOOST_FOREACH
63+
IncludeBlocks: Preserve
64+
IncludeCategories:
65+
- Regex: '^<ext/.*\.h>'
66+
Priority: 2
67+
- Regex: '^<.*\.h>'
68+
Priority: 1
69+
- Regex: '^<.*'
70+
Priority: 2
71+
- Regex: '.*'
72+
Priority: 3
73+
IncludeIsMainRegex: '([-_](test|unittest))?$'
74+
IndentCaseLabels: true
75+
IndentPPDirectives: None
76+
IndentWidth: 4
77+
IndentWrappedFunctionNames: false
78+
JavaScriptQuotes: Leave
79+
JavaScriptWrapImports: true
80+
KeepEmptyLinesAtTheStartOfBlocks: false
81+
MacroBlockBegin: ''
82+
MacroBlockEnd: ''
83+
MaxEmptyLinesToKeep: 1
84+
NamespaceIndentation: None
85+
ObjCBinPackProtocolList: Never
86+
ObjCBlockIndentWidth: 2
87+
ObjCSpaceAfterProperty: false
88+
ObjCSpaceBeforeProtocolList: true
89+
PenaltyBreakAssignment: 2
90+
PenaltyBreakBeforeFirstCallParameter: 1
91+
PenaltyBreakComment: 300
92+
PenaltyBreakFirstLessLess: 120
93+
PenaltyBreakString: 1000
94+
PenaltyBreakTemplateDeclaration: 10
95+
PenaltyExcessCharacter: 1000000
96+
PenaltyReturnTypeOnItsOwnLine: 200
97+
PointerAlignment: Left
98+
RawStringFormats:
99+
- Language: Cpp
100+
Delimiters:
101+
- cc
102+
- CC
103+
- cpp
104+
- Cpp
105+
- CPP
106+
- 'c++'
107+
- 'C++'
108+
CanonicalDelimiter: ''
109+
BasedOnStyle: google
110+
- Language: TextProto
111+
Delimiters:
112+
- pb
113+
- PB
114+
- proto
115+
- PROTO
116+
EnclosingFunctions:
117+
- EqualsProto
118+
- EquivToProto
119+
- PARSE_PARTIAL_TEXT_PROTO
120+
- PARSE_TEST_PROTO
121+
- PARSE_TEXT_PROTO
122+
- ParseTextOrDie
123+
- ParseTextProtoOrDie
124+
CanonicalDelimiter: ''
125+
BasedOnStyle: google
126+
ReflowComments: true
127+
SortIncludes: false
128+
SortUsingDeclarations: false
129+
SpaceAfterCStyleCast: false
130+
SpaceAfterTemplateKeyword: true
131+
SpaceBeforeAssignmentOperators: true
132+
SpaceBeforeCpp11BracedList: false
133+
SpaceBeforeCtorInitializerColon: true
134+
SpaceBeforeInheritanceColon: true
135+
SpaceBeforeParens: ControlStatements
136+
SpaceBeforeRangeBasedForLoopColon: true
137+
SpaceInEmptyParentheses: false
138+
SpacesBeforeTrailingComments: 2
139+
SpacesInAngles: false
140+
SpacesInContainerLiterals: true
141+
SpacesInCStyleCastParentheses: false
142+
SpacesInParentheses: false
143+
SpacesInSquareBrackets: false
144+
Standard: Auto
145+
StatementMacros:
146+
- Q_UNUSED
147+
- QT_REQUIRE_VERSION
148+
TabWidth: 8
149+
UseTab: Never
150+
...
151+

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
.DS_Store
2+
build

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[submodule "external/mm_file"]
2+
path = external/mm_file
3+
url = https://github.com/jermp/mm_file.git

CMakeLists.txt

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Top-level build script for the BIC (Binary Interpolative Coding) project.
#
# Configure-time switches (both default to OFF, both only have an effect on
# UNIX-like platforms):
#   -DUSE_SANITIZERS=On   compile and link with AddressSanitizer
#   -DRUNAWARE=On         define the RUNAWARE preprocessor macro
#
# CMake 4.x refuses projects that request compatibility with versions older
# than 3.5, so 3.5 is the lowest usable floor. The "...3.28" upper bound
# opts in to newer policy behaviour; CMake releases older than 3.12 ignore
# that suffix and simply use the minimum.
cmake_minimum_required(VERSION 3.5...3.28)
project(BIC)

option(USE_SANITIZERS "Build with AddressSanitizer (UNIX only)" OFF)
option(RUNAWARE "Define the RUNAWARE preprocessor macro (UNIX only)" OFF)

# Default to an optimized build when the user did not pick a configuration.
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE "Release")
endif()
message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")

# Place every executable directly in the root of the build tree.
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")

# Request plain C++14 (-std=c++14 rather than -std=gnu++14) for all targets
# created by the subdirectories below.
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# With policy CMP0025 enabled (implied by the minimum version above), Apple's
# compiler identifies itself as "AppleClang" instead of "Clang", so both IDs
# must be accepted to keep selecting libc++ on macOS.
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang"
   OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
    string(APPEND CMAKE_CXX_FLAGS " -stdlib=libc++")
endif()

if(UNIX)
    # These flags are appended to CMAKE_CXX_FLAGS and therefore apply to
    # every build configuration, Debug included.
    string(APPEND CMAKE_CXX_FLAGS " -O3")
    string(APPEND CMAKE_CXX_FLAGS " -march=native")
    string(APPEND CMAKE_CXX_FLAGS " -ggdb")
    string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Wno-missing-braces")

    if(USE_SANITIZERS)
        string(APPEND CMAKE_CXX_FLAGS
               " -fsanitize=address -fno-omit-frame-pointer")
    endif()

    if(RUNAWARE)
        string(APPEND CMAKE_CXX_FLAGS " -DRUNAWARE")
    endif()
endif()

# Kept directory-scoped on purpose: the targets are declared in the
# subdirectories added below and inherit this include path.
include_directories("${BIC_SOURCE_DIR}/include")

add_subdirectory(src)
add_subdirectory(test)

README.md

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
Binary Interpolative Coding
2+
====
3+
4+
A C++ library implementing the *Binary Interpolative Coding* compression algorithm invented by Alistair Moffat and Lang Stuiver [1].
5+
6+
The algorithm can be used to compress sorted integer sequences (here,
7+
assumed to be increasing).
8+
9+
The implementation comes in different flavours:
10+
you can choose among
11+
simple *binary* codes, *left-most minimal* codes and *centered minimal* codes.
12+
Additionally, the implementation is *run-aware*, i.e.,
13+
it optimizes encoding/decoding of runs of consecutive identifiers.
14+
15+
##### Table of contents
16+
* [Compiling the code](#compiling-the-code)
17+
* [Quick Start](#quick-start)
18+
* [Encoding/decoding a collection of sequences](#encodingdecoding-a-collection-of-sequences)
19+
* [Benchmark](#benchmark)
20+
* [Author](#author)
21+
* [References](#references)
22+
23+
Compiling the code
24+
------------------
25+
26+
The code is tested on Linux with `gcc` 7.3.0 and on macOS 10.14 with `clang` 10.0.0.
27+
To build the code, [`CMake`](https://cmake.org/) is required.
28+
29+
Clone the repository with
30+
31+
$ git clone --recursive https://github.com/jermp/interpolative_coding.git
32+
33+
If you have cloned the repository without `--recursive`, you will need to perform the following commands before
34+
compiling:
35+
36+
$ git submodule init
37+
$ git submodule update
38+
39+
To compile the code for a release environment *and* best performance (see file `CMakeLists.txt` for the compilation flags used), do:
40+
41+
$ mkdir build
42+
$ cd build
43+
$ cmake .. -DRUNAWARE=On
44+
$ make
45+
46+
Hint: Use `make -j4` to compile the library in parallel using, e.g., 4 jobs.
47+
48+
For a testing environment, use the following instead:
49+
50+
$ mkdir debug_build
51+
$ cd debug_build
52+
$ cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZERS=On
53+
$ make
54+
55+
Quick Start
56+
-------
57+
58+
For a quick start, see the source file `test/example.cpp`.
59+
After compilation, run this example with
60+
61+
$ ./example
62+
63+
A simpler variation is shown below.
64+
65+
```C++
66+
#include <cassert>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

#include "interpolative_coding.hpp"
using namespace bic;

// Encodes `in` with the writer of the chosen code type, decodes the result
// with the matching reader, and prints the input, the decoded values and the
// space usage (total bits and bits per value).
//
// BinaryCode must expose nested `writer` and `reader` types, as `binary`,
// `leftmost_minimal` and `centered_minimal` are used below.
template <typename BinaryCode>
void test(std::vector<uint32_t> const& in) {
    std::cout << "to be encoded:\n";
    for (auto x : in) {
        std::cout << x << " ";
    }
    std::cout << std::endl;

    uint32_t n = in.size();

    encoder<typename BinaryCode::writer> enc;
    enc.encode(in.data(), n);

    std::vector<uint32_t> out(n);
    decoder<typename BinaryCode::reader> dec;
    // `m` is reported below as the number of decoded values.
    uint32_t m = dec.decode(enc.bits().data(), out.data());
    // Sanity check only: assert is compiled out when NDEBUG is defined.
    assert(m == n);

    std::cout << "decoded " << m << " values" << std::endl;
    std::cout << "total bits " << enc.num_bits() << std::endl;
    std::cout << static_cast<double>(enc.num_bits()) / m << " bits x key"
              << std::endl;

    std::cout << "decoded:\n";
    for (auto x : out) {
        std::cout << x << " ";
    }
    std::cout << std::endl;
}

// Usage: <program> binary_code_type
// where binary_code_type is one of "binary", "leftmost_minimal" or
// "centered_minimal". Returns 1 on a missing or unknown type, 0 otherwise.
int main(int argc, char** argv) {
    if (argc < 2) {
        std::cerr << argv[0] << " binary_code_type" << std::endl;
        return 1;
    }

    std::vector<uint32_t> in = {3, 4, 7, 13, 14, 15, 21, 25, 36, 38, 54, 62};

    std::string type(argv[1]);

    // Dispatch on the code type given on the command line.
    if (type == "binary") {
        test<binary>(in);
    } else if (type == "leftmost_minimal") {
        test<leftmost_minimal>(in);
    } else if (type == "centered_minimal") {
        test<centered_minimal>(in);
    } else {
        std::cerr << "unknown type '" << type << "'" << std::endl;
        return 1;
    }

    return 0;
}
124+
```
125+
126+
Encoding/decoding a collection of sequences
127+
----------------------------------
128+
129+
Typically, we want to build all the sequences from
130+
a collection.
131+
In this case, we assume that the input collection
132+
is a binary file with all the sequences being written
133+
as 32-bit integers. In this library, we follow the
134+
input data format of the [`ds2i`](https://github.com/ot/ds2i) library:
135+
each sequence is prefixed by an additional
136+
32-bit integer representing the size of the sequence.
137+
The collection file starts with a singleton sequence
138+
containing the universe of representation of the sequences, i.e., the maximum representable value.
139+
140+
We also assume all sequences are *increasing*.
141+
142+
The file `data/test_collection.docs` represents an example of
143+
such organization.
144+
145+
To encode all the sequences from this file, do:
146+
147+
$ ./encode leftmost_minimal ../data/test_collection.docs -o test.bin
148+
149+
To decode all the sequences from the encoded file `test.bin`, do:
150+
151+
$ ./decode leftmost_minimal test.bin
152+
153+
To check correctness of the implementation, use:
154+
155+
$ ./check leftmost_minimal ../data/test_collection.docs test.bin
156+
157+
which will compare every decoded integer against the input collection.
158+
159+
Benchmark
160+
------
161+
For this benchmark we used the whole Gov2 dataset, containing
162+
5,742,630,292 integers in 35,636,425 sequences.
163+
164+
We report the average number of bits per integer (bpi)
165+
and nanoseconds spent per decoded integer (with and without the
166+
run-aware optimization).
167+
168+
Time measurements were taken using a Linux 4.4.0 server machine with
169+
an Intel i7-7700 CPU (@3.6 GHz) and 64 GB of RAM.
170+
The code was compiled with gcc 7.3.0 with all optimizations
171+
(see also `CMakeLists.txt`).
172+
173+
|**Method** |**bpi** | **ns/int (run-aware)** | **ns/int (not run-aware)**|
174+
|:-----------------|:------:|:-----------------------:|:-------------------------:|
175+
|simple |3.532 | 3.45 | 4.65 |
176+
|left-most minimal |3.362 | 5.78 | 7.07 |
177+
|centered minimal |3.361 | 5.78 | 7.07 |
178+
179+
Author
180+
------
181+
* [Giulio Ermanno Pibiri](http://pages.di.unipi.it/pibiri/), <giulio.ermanno.pibiri@isti.cnr.it>
182+
183+
References
184+
-------
185+
* [1] Alistair Moffat and Lang Stuiver. 2000. Binary Interpolative Coding for Effective Index Compression. Information Retrieval Journal 3, 1 (2000), 25 – 47.

data/test_collection.docs

13.1 MB
Binary file not shown.

0 commit comments

Comments
 (0)