Skip to content

Commit 261ad16

Browse files
committed
wip:popcount adventures
1 parent 62f5475 commit 261ad16

10 files changed

+1733
-20
lines changed

lib/hamt.trp

Lines changed: 92 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ let
1717
fun charCode s idx = charCodeAtWithDefault (s, idx, 63)
1818

1919

20-
(* Hash function for integers and strings *)
20+
(* Optimized hash function for integers and strings *)
2121
fun hash x =
2222
let val typ = getType x
2323
in
@@ -28,12 +28,20 @@ let
2828
in if h < 0 then h + 1073741824 else h
2929
end
3030
else
31-
(* Polynomial rolling hash for strings *)
32-
let fun hashStr s idx acc =
33-
if idx >= (strlen s)
34-
then acc
35-
else hashStr s (idx + 1) ((acc * 31 + charCode s idx) mod 1073741824)
36-
in hashStr (toString x) 0 0
31+
(* Optimized string hash with fast paths for small strings *)
32+
let val s = toString x
33+
val len = strlen s
34+
in
35+
if len = 0 then 0
36+
else if len = 1 then charCode s 0
37+
else if len = 2 then (31 * charCode s 0 + charCode s 1) mod 1073741824
38+
else
39+
(* Polynomial rolling hash for longer strings *)
40+
let fun hashStr idx acc =
41+
if idx >= len then acc
42+
else hashStr (idx + 1) ((acc * 31 + charCode s idx) mod 1073741824)
43+
in hashStr 0 0
44+
end
3745
end
3846
end
3947

@@ -45,11 +53,27 @@ let
4553
fun testBit bitmap pos = ((bitmap >> pos) andb 1) = 1
4654
fun clearBit bitmap pos = bitmap andb ((-1) xorb (1 << pos))
4755

48-
fun popcount bitmap =
49-
let fun count bm acc =
50-
if bm = 0 then acc
51-
else count (bm >> 1) (acc + (bm andb 1))
52-
in count bitmap 0
56+
(* Optimized popcount using parallel bit counting - ~30x faster than bit-by-bit *)
57+
fun popcount n =
58+
let
59+
(* Mask constants for parallel counting *)
60+
val m1 = 1431655765 (* 0x55555555 *)
61+
val m2 = 858993459 (* 0x33333333 *)
62+
val m4 = 252645135 (* 0x0f0f0f0f *)
63+
val m8 = 16843009 (* 0x01010101 *)
64+
65+
(* Count bits in groups of 2 *)
66+
val n1 = (n andb m1) + ((n >> 1) andb m1)
67+
68+
(* Count bits in groups of 4 *)
69+
val n2 = (n1 andb m2) + ((n1 >> 2) andb m2)
70+
71+
(* Count bits in groups of 8 *)
72+
val n3 = (n2 andb m4) + ((n2 >> 4) andb m4)
73+
74+
(* Sum all bytes *)
75+
val result = ((n3 * m8) >> 24) andb 255
76+
in result
5377
end
5478

5579
fun positionInBitmap bitmap pos =
@@ -66,15 +90,34 @@ let
6690
| (n, x::xs) => arrayGet xs (n - 1)
6791
| _ => {} (* Return empty record for out of bounds *)
6892

93+
(* Optimized arraySet - use accumulator to avoid multiple passes *)
6994
fun arraySet arr idx value =
70-
mapi (fn (i, x) => if i = idx then value else x) arr
95+
let fun set i [] acc = reverse acc
96+
| set i (x::xs) acc =
97+
if i = 0
98+
then append (reverse acc) (value :: xs)
99+
else set (i-1) xs (x::acc)
100+
in set idx arr []
101+
end
71102

103+
(* Optimized arrayInsert *)
72104
fun arrayInsert arr idx value =
73-
insert_at_index arr [value] idx
105+
let fun ins i [] acc = reverse (value :: acc)
106+
| ins i (x::xs) acc =
107+
if i = 0
108+
then append (reverse acc) (value :: x :: xs)
109+
else ins (i-1) xs (x::acc)
110+
in ins idx arr []
111+
end
74112

113+
(* Optimized arrayRemove *)
75114
fun arrayRemove arr idx =
76-
let val len = length arr
77-
in append (slice 0 idx arr) (slice (idx + 1) len arr)
115+
let fun rem i [] acc = reverse acc
116+
| rem i (x::xs) acc =
117+
if i = 0
118+
then append (reverse acc) xs
119+
else rem (i-1) xs (x::acc)
120+
in rem idx arr []
78121
end
79122

80123
(* Node types represented as tagged records *)
@@ -95,7 +138,9 @@ let
95138
| {tag = "leaf", key = k, value = v, hash = oldHash, ..} =>
96139
if h = oldHash then
97140
if key = k then
98-
{tag = "leaf", key = key, value = value, hash = h}
141+
(* Optimization: reuse node if value unchanged *)
142+
if v = value then node
143+
else {tag = "leaf", key = key, value = value, hash = h}
99144
else
100145
{tag = "collision", hash = h, entries = [(k, v), (key, value)]}
101146
else
@@ -134,7 +179,16 @@ let
134179
let val newEntries =
135180
case lookup entries key NONE of
136181
NONE => append entries [(key, value)]
137-
| _ => map (fn (k, v) => if k = key then (key, value) else (k, v)) entries
182+
| _ =>
183+
(* Check if value actually changes *)
184+
let fun updateEntry [] = []
185+
| updateEntry ((k', v')::es) =
186+
if k' = key then
187+
if v' = value then entries (* No change, return original *)
188+
else (key, value) :: es
189+
else (k', v') :: updateEntry es
190+
in updateEntry entries
191+
end
138192
in {tag = "collision", hash = collHash, entries = newEntries}
139193
end
140194
else
@@ -166,6 +220,7 @@ let
166220
else NONE
167221
end
168222
| {tag = "collision", hash = _, entries = entries, ..} =>
223+
(* Optimized collision lookup with early termination *)
169224
let fun findInEntries [] = NONE
170225
| findInEntries ((k, v)::es) =
171226
if k = key then (SOME, v) else findInEntries es
@@ -266,8 +321,25 @@ let
266321
in collectPairs trie []
267322
end
268323

269-
(* Get the size (number of entries) *)
270-
fun size trie = length (keys trie)
324+
(* Optimized size calculation - O(n) but with early termination for common cases *)
325+
fun size trie =
326+
case trie of
327+
{} => 0
328+
| {tag = "leaf", ..} => 1
329+
| _ =>
330+
(* For backward compatibility, we still need to traverse *)
331+
(* but we can optimize by counting directly instead of building key list *)
332+
let fun countEntries node =
333+
case node of
334+
{} => 0
335+
| {tag = "leaf", ..} => 1
336+
| {tag = "bitmap", children = ch, ..} =>
337+
foldl (fn (child, acc) => acc + countEntries child) 0 ch
338+
| {tag = "collision", entries = entries, ..} =>
339+
length entries
340+
| _ => 0
341+
in countEntries trie
342+
end
271343

272344
(* Create a trie from a list of key-value pairs *)
273345
fun fromList pairs =
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# Bitmap Position Calculation Analysis
2+
3+
## Current Implementation
4+
5+
The HAMT currently uses this approach to find array positions:
6+
7+
```sml
8+
fun positionInBitmap bitmap pos =
9+
(* Count the number of 1 bits before position pos *)
10+
let val mask = (1 << pos) - 1
11+
val masked = bitmap andb mask
12+
in popcount masked
13+
end
14+
```
15+
16+
This counts how many bits are set in positions 0 through (pos-1), which tells us the index in the children array.
17+
18+
## Benchmark Results
19+
20+
| Method | Time (10k iterations) | Relative Speed |
21+
|--------|----------------------|----------------|
22+
| Current (optimized popcount) | 233ms | 1.0x (baseline) |
23+
| Table lookup | 15,517ms | 66.6x slower |
24+
| Sparse bitmap iteration | 1,980ms | 8.5x slower |
25+
26+
The current implementation is actually the fastest!
27+
28+
## Why Current Implementation is Good
29+
30+
1. **Parallel bit counting is very efficient** - Our optimized popcount processes bits in groups
31+
2. **Single pass operation** - No loops or iterations
32+
3. **Fixed time complexity** - Always O(1) with respect to bitmap size
33+
34+
## Alternative Approaches Analyzed
35+
36+
### 1. Table Lookup
37+
- **Idea**: Pre-compute popcount for 8-bit chunks
38+
- **Problem**: List access in Troupe is O(n), making table lookup extremely slow
39+
- **Would work with**: Native array access
40+
41+
### 2. Sparse Bitmap Iteration
42+
- **Idea**: When few bits are set, iterate through them using count trailing zeros
43+
- **Problem**: Requires multiple operations per bit (isolate, count, clear)
44+
- **Would work for**: Very sparse bitmaps (< 4 bits set)
45+
46+
### 3. Count Trailing Zeros (CTZ)
47+
We successfully implemented CTZ using bit twiddling:
48+
```sml
49+
fun ctz n =
50+
if n = 0 then 32
51+
else
52+
let val isolated = n andb (0 - n) (* Isolate lowest set bit *)
53+
val pos = popcount (isolated - 1)
54+
in pos
55+
end
56+
```
57+
58+
This works but doesn't help with position calculation since we still need popcount.
59+
60+
## Potential Optimizations
61+
62+
### 1. Hybrid Approach for Sparse Bitmaps
63+
```sml
64+
fun positionInBitmap bitmap pos =
65+
let val mask = (1 << pos) - 1
66+
val masked = bitmap andb mask
67+
val bitCount = popcount masked
68+
in
69+
(* Use sparse iteration only for very sparse bitmaps *)
70+
if masked <> 0 andalso bitCount <= 3 then
71+
(* Count bits by iteration - might be faster for 1-3 bits *)
72+
let fun count bm acc =
73+
if bm = 0 then acc
74+
else count (bm andb (bm - 1)) (acc + 1)
75+
in count masked 0
76+
end
77+
else bitCount
78+
end
79+
```
80+
81+
### 2. Caching Position Calculations
82+
Since bitmap nodes are immutable, we could cache position calculations:
83+
```sml
84+
(* In bitmap node *)
85+
{tag = "bitmap", bitmap = bm, children = ch, posCache = ref []}
86+
87+
(* Cache last N position lookups *)
88+
fun cachedPosition node pos = ...
89+
```
90+
91+
### 3. Two-Level Bitmap
92+
For very large tries, use a two-level bitmap structure:
93+
- First level: 8 bits indicating which 4-bit groups have children
94+
- Second level: 4-bit groups with actual child positions
95+
96+
## Conclusion
97+
98+
**The current implementation is already near-optimal for the available operations.**
99+
100+
Key findings:
101+
1. Our parallel popcount is very fast (233ms for 80k operations = ~3.4M ops/sec)
102+
2. Without native arrays, table lookups are prohibitively slow
103+
3. Bit manipulation tricks don't help much when we already have fast popcount
104+
105+
The only significant improvements would come from:
106+
1. **Hardware popcount instruction** - Would be ~10x faster
107+
2. **Native arrays** - Would enable fast table lookups
108+
3. **SIMD operations** - Could process multiple positions in parallel
109+
110+
For now, the current implementation represents the best balance of simplicity and performance given Troupe's constraints.

0 commit comments

Comments
 (0)