Skip to content

Commit 3b24175

Browse files
ezekielnewren authored and gitster committed
xdiff: split xrecord_t.ha into line_hash and minimal_perfect_hash
The ha field is serving two different purposes, which makes the code harder to read. At first glance it looks like many places assume there could never be hash collisions between lines of the two input files. In reality, line_hash is used together with xdl_recmatch() to ensure correct comparisons of lines, even when collisions occur.

To make this clearer, the old ha field has been split:

* line_hash: The straightforward hash of a line, requiring no additional context.
* minimal_perfect_hash: Not a new concept, but now a separate field. It comes from the classifier's general-purpose hash table, which assigns each line a unique and minimal hash across the two files.

Signed-off-by: Ezekiel Newren <ezekielnewren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
1 parent b1feb22 commit 3b24175

File tree

5 files changed

+20
-19
lines changed

5 files changed

+20
-19
lines changed

xdiff/xdiffi.c

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -22,9 +22,9 @@
2222

2323
#include "xinclude.h"
2424

25-
static unsigned long get_hash(xdfile_t *xdf, long index)
25+
static size_t get_hash(xdfile_t *xdf, long index)
2626
{
27-
return xdf->recs[xdf->rindex[index]].ha;
27+
return xdf->recs[xdf->rindex[index]].minimal_perfect_hash;
2828
}
2929

3030
#define XDL_MAX_COST_MIN 256
@@ -385,7 +385,7 @@ static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1,
385385

386386
static int recs_match(xrecord_t *rec1, xrecord_t *rec2)
387387
{
388-
return (rec1->ha == rec2->ha);
388+
return rec1->minimal_perfect_hash == rec2->minimal_perfect_hash;
389389
}
390390

391391
/*

xdiff/xhistogram.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -90,15 +90,15 @@ struct region {
9090

9191
static int cmp_recs(xrecord_t *r1, xrecord_t *r2)
9292
{
93-
return r1->ha == r2->ha;
93+
return r1->minimal_perfect_hash == r2->minimal_perfect_hash;
9494

9595
}
9696

9797
#define CMP(i, s1, l1, s2, l2) \
9898
(cmp_recs(REC(i->env, s1, l1), REC(i->env, s2, l2)))
9999

100100
#define TABLE_HASH(index, side, line) \
101-
XDL_HASHLONG((REC(index->env, side, line))->ha, index->table_bits)
101+
XDL_HASHLONG((REC(index->env, side, line))->minimal_perfect_hash, index->table_bits)
102102

103103
static int scanA(struct histindex *index, int line1, int count1)
104104
{

xdiff/xpatience.c

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@
4848
struct hashmap {
4949
int nr, alloc;
5050
struct entry {
51-
unsigned long hash;
51+
size_t minimal_perfect_hash;
5252
/*
5353
* 0 = unused entry, 1 = first line, 2 = second, etc.
5454
* line2 is NON_UNIQUE if the line is not unique
@@ -101,10 +101,10 @@ static void insert_record(xpparam_t const *xpp, int line, struct hashmap *map,
101101
* So we multiply ha by 2 in the hope that the hashing was
102102
* "unique enough".
103103
*/
104-
int index = (int)((record->ha << 1) % map->alloc);
104+
int index = (int)((record->minimal_perfect_hash << 1) % map->alloc);
105105

106106
while (map->entries[index].line1) {
107-
if (map->entries[index].hash != record->ha) {
107+
if (map->entries[index].minimal_perfect_hash != record->minimal_perfect_hash) {
108108
if (++index >= map->alloc)
109109
index = 0;
110110
continue;
@@ -120,7 +120,7 @@ static void insert_record(xpparam_t const *xpp, int line, struct hashmap *map,
120120
if (pass == 2)
121121
return;
122122
map->entries[index].line1 = line;
123-
map->entries[index].hash = record->ha;
123+
map->entries[index].minimal_perfect_hash = record->minimal_perfect_hash;
124124
map->entries[index].anchor = is_anchor(xpp, (const char *)map->env->xdf1.recs[line - 1].ptr);
125125
if (!map->first)
126126
map->first = map->entries + index;
@@ -248,7 +248,7 @@ static int match(struct hashmap *map, int line1, int line2)
248248
{
249249
xrecord_t *record1 = &map->env->xdf1.recs[line1 - 1];
250250
xrecord_t *record2 = &map->env->xdf2.recs[line2 - 1];
251-
return record1->ha == record2->ha;
251+
return record1->minimal_perfect_hash == record2->minimal_perfect_hash;
252252
}
253253

254254
static int patience_diff(xpparam_t const *xpp, xdfenv_t *env,

xdiff/xprepare.c

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -96,9 +96,9 @@ static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t
9696
long hi;
9797
xdlclass_t *rcrec;
9898

99-
hi = (long) XDL_HASHLONG(rec->ha, cf->hbits);
99+
hi = (long) XDL_HASHLONG(rec->line_hash, cf->hbits);
100100
for (rcrec = cf->rchash[hi]; rcrec; rcrec = rcrec->next)
101-
if (rcrec->rec.ha == rec->ha &&
101+
if (rcrec->rec.line_hash == rec->line_hash &&
102102
xdl_recmatch((const char *)rcrec->rec.ptr, (long)rcrec->rec.size,
103103
(const char *)rec->ptr, (long)rec->size, cf->flags))
104104
break;
@@ -120,7 +120,7 @@ static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t
120120

121121
(pass == 1) ? rcrec->len1++ : rcrec->len2++;
122122

123-
rec->ha = (unsigned long) rcrec->idx;
123+
rec->minimal_perfect_hash = (size_t)rcrec->idx;
124124

125125
return 0;
126126
}
@@ -158,7 +158,7 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_
158158
crec = &xdf->recs[xdf->nrec++];
159159
crec->ptr = prev;
160160
crec->size = cur - prev;
161-
crec->ha = hav;
161+
crec->line_hash = hav;
162162
if (xdl_classify_record(pass, cf, crec) < 0)
163163
goto abort;
164164
}
@@ -290,15 +290,15 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
290290
if ((mlim = xdl_bogosqrt(xdf1->nrec)) > XDL_MAX_EQLIMIT)
291291
mlim = XDL_MAX_EQLIMIT;
292292
for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
293-
rcrec = cf->rcrecs[recs->ha];
293+
rcrec = cf->rcrecs[recs->minimal_perfect_hash];
294294
nm = rcrec ? rcrec->len2 : 0;
295295
action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
296296
}
297297

298298
if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT)
299299
mlim = XDL_MAX_EQLIMIT;
300300
for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
301-
rcrec = cf->rcrecs[recs->ha];
301+
rcrec = cf->rcrecs[recs->minimal_perfect_hash];
302302
nm = rcrec ? rcrec->len1 : 0;
303303
action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
304304
}
@@ -350,15 +350,15 @@ static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) {
350350
recs2 = xdf2->recs;
351351
for (i = 0, lim = XDL_MIN(xdf1->nrec, xdf2->nrec); i < lim;
352352
i++, recs1++, recs2++)
353-
if (recs1->ha != recs2->ha)
353+
if (recs1->minimal_perfect_hash != recs2->minimal_perfect_hash)
354354
break;
355355

356356
xdf1->dstart = xdf2->dstart = i;
357357

358358
recs1 = xdf1->recs + xdf1->nrec - 1;
359359
recs2 = xdf2->recs + xdf2->nrec - 1;
360360
for (lim -= i, i = 0; i < lim; i++, recs1--, recs2--)
361-
if (recs1->ha != recs2->ha)
361+
if (recs1->minimal_perfect_hash != recs2->minimal_perfect_hash)
362362
break;
363363

364364
xdf1->dend = xdf1->nrec - i - 1;

xdiff/xtypes.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,8 @@ typedef struct s_chastore {
4141
typedef struct s_xrecord {
4242
uint8_t const *ptr;
4343
size_t size;
44-
unsigned long ha;
44+
uint64_t line_hash;
45+
size_t minimal_perfect_hash;
4546
} xrecord_t;
4647

4748
typedef struct s_xdfile {

0 commit comments

Comments
 (0)