Skip to content

Commit 9ab444e

Browse files
committed
Merge branch 'tc/last-modified-active-paths-optimization' into next
"git last-modified" was optimized by narrowing the set of paths to follow as it dug deeper in the history. * tc/last-modified-active-paths-optimization: last-modified: implement faster algorithm
2 parents 1eb3440 + 2a04e8c commit 9ab444e

File tree

3 files changed

+237
-16
lines changed

3 files changed

+237
-16
lines changed

builtin/last-modified.c

Lines changed: 235 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,26 +2,32 @@
22
#include "bloom.h"
33
#include "builtin.h"
44
#include "commit-graph.h"
5+
#include "commit-slab.h"
56
#include "commit.h"
67
#include "config.h"
7-
#include "environment.h"
88
#include "diff.h"
99
#include "diffcore.h"
1010
#include "environment.h"
11+
#include "ewah/ewok.h"
1112
#include "hashmap.h"
1213
#include "hex.h"
13-
#include "log-tree.h"
1414
#include "object-name.h"
1515
#include "object.h"
1616
#include "parse-options.h"
17+
#include "prio-queue.h"
1718
#include "quote.h"
1819
#include "repository.h"
1920
#include "revision.h"
2021

22+
/* Remember to update object flag allocation in object.h */
23+
#define PARENT1 (1u<<16) /* used instead of SEEN */
24+
#define PARENT2 (1u<<17) /* used instead of BOTTOM, BOUNDARY */
25+
2126
struct last_modified_entry {
2227
struct hashmap_entry hashent;
2328
struct object_id oid;
2429
struct bloom_key key;
30+
size_t diff_idx;
2531
const char path[FLEX_ARRAY];
2632
};
2733

@@ -37,13 +43,45 @@ static int last_modified_entry_hashcmp(const void *unused UNUSED,
3743
return strcmp(ent1->path, path ? path : ent2->path);
3844
}
3945

46+
/*
47+
* Hold a bitmap for each commit we're working with. In the bitmap, each bit
48+
* represents a path in `lm->all_paths`. An active bit indicates the path still
49+
* needs to be associated to a commit.
50+
*/
51+
define_commit_slab(active_paths_for_commit, struct bitmap *);
52+
4053
struct last_modified {
4154
struct hashmap paths;
4255
struct rev_info rev;
4356
bool recursive;
4457
bool show_trees;
58+
59+
const char **all_paths;
60+
size_t all_paths_nr;
61+
struct active_paths_for_commit active_paths;
62+
63+
/* 'scratch' to avoid allocating a bitmap every process_parent() */
64+
struct bitmap *scratch;
4565
};
4666

67+
static struct bitmap *active_paths_for(struct last_modified *lm, struct commit *c)
68+
{
69+
struct bitmap **bitmap = active_paths_for_commit_at(&lm->active_paths, c);
70+
if (!*bitmap)
71+
*bitmap = bitmap_word_alloc(lm->all_paths_nr / BITS_IN_EWORD + 1);
72+
73+
return *bitmap;
74+
}
75+
76+
static void active_paths_free(struct last_modified *lm, struct commit *c)
77+
{
78+
struct bitmap **bitmap = active_paths_for_commit_at(&lm->active_paths, c);
79+
if (*bitmap) {
80+
bitmap_free(*bitmap);
81+
*bitmap = NULL;
82+
}
83+
}
84+
4785
static void last_modified_release(struct last_modified *lm)
4886
{
4987
struct hashmap_iter iter;
@@ -54,6 +92,8 @@ static void last_modified_release(struct last_modified *lm)
5492

5593
hashmap_clear_and_free(&lm->paths, struct last_modified_entry, hashent);
5694
release_revisions(&lm->rev);
95+
96+
free(lm->all_paths);
5797
}
5898

5999
struct last_modified_callback_data {
@@ -146,7 +186,7 @@ static void mark_path(const char *path, const struct object_id *oid,
146186
* Is it arriving at a version of interest, or is it from a side branch
147187
* which did not contribute to the final state?
148188
*/
149-
if (!oideq(oid, &ent->oid))
189+
if (oid && !oideq(oid, &ent->oid))
150190
return;
151191

152192
last_modified_emit(data->lm, path, data->commit);
@@ -196,7 +236,17 @@ static void last_modified_diff(struct diff_queue_struct *q,
196236
}
197237
}
198238

199-
static bool maybe_changed_path(struct last_modified *lm, struct commit *origin)
239+
static void pass_to_parent(struct bitmap *c,
240+
struct bitmap *p,
241+
size_t pos)
242+
{
243+
bitmap_unset(c, pos);
244+
bitmap_set(p, pos);
245+
}
246+
247+
static bool maybe_changed_path(struct last_modified *lm,
248+
struct commit *origin,
249+
struct bitmap *active)
200250
{
201251
struct bloom_filter *filter;
202252
struct last_modified_entry *ent;
@@ -213,49 +263,212 @@ static bool maybe_changed_path(struct last_modified *lm, struct commit *origin)
213263
return true;
214264

215265
hashmap_for_each_entry(&lm->paths, &iter, ent, hashent) {
266+
if (active && !bitmap_get(active, ent->diff_idx))
267+
continue;
268+
216269
if (bloom_filter_contains(filter, &ent->key,
217270
lm->rev.bloom_filter_settings))
218271
return true;
219272
}
220273
return false;
221274
}
222275

276+
static void process_parent(struct last_modified *lm,
277+
struct prio_queue *queue,
278+
struct commit *c, struct bitmap *active_c,
279+
struct commit *parent, int parent_i)
280+
{
281+
struct bitmap *active_p;
282+
283+
repo_parse_commit(lm->rev.repo, parent);
284+
active_p = active_paths_for(lm, parent);
285+
286+
/*
287+
* The first time entering this function for this commit (i.e. first parent)
288+
* see if Bloom filters will tell us it's worth to do the diff.
289+
*/
290+
if (parent_i || maybe_changed_path(lm, c, active_c)) {
291+
diff_tree_oid(&parent->object.oid,
292+
&c->object.oid, "", &lm->rev.diffopt);
293+
diffcore_std(&lm->rev.diffopt);
294+
}
295+
296+
/*
297+
* Test each path for TREESAME-ness against the parent. If a path is
298+
* TREESAME, pass it on to this parent.
299+
*
300+
* First, collect all paths that are *not* TREESAME in 'scratch'.
301+
* Then, pass paths that *are* TREESAME and active to the parent.
302+
*/
303+
for (int i = 0; i < diff_queued_diff.nr; i++) {
304+
struct diff_filepair *fp = diff_queued_diff.queue[i];
305+
const char *path = fp->two->path;
306+
struct last_modified_entry *ent =
307+
hashmap_get_entry_from_hash(&lm->paths, strhash(path), path,
308+
struct last_modified_entry, hashent);
309+
if (ent) {
310+
size_t k = ent->diff_idx;
311+
if (bitmap_get(active_c, k))
312+
bitmap_set(lm->scratch, k);
313+
}
314+
}
315+
for (size_t i = 0; i < lm->all_paths_nr; i++) {
316+
if (bitmap_get(active_c, i) && !bitmap_get(lm->scratch, i))
317+
pass_to_parent(active_c, active_p, i);
318+
}
319+
320+
/*
321+
* If parent has any active paths, put it on the queue (if not already).
322+
*/
323+
if (!bitmap_is_empty(active_p) && !(parent->object.flags & PARENT1)) {
324+
parent->object.flags |= PARENT1;
325+
prio_queue_put(queue, parent);
326+
}
327+
if (!(parent->object.flags & PARENT1))
328+
active_paths_free(lm, parent);
329+
330+
memset(lm->scratch->words, 0x0, lm->scratch->word_alloc);
331+
diff_queue_clear(&diff_queued_diff);
332+
}
333+
223334
static int last_modified_run(struct last_modified *lm)
224335
{
336+
int max_count, queue_popped = 0;
337+
struct prio_queue queue = { compare_commits_by_gen_then_commit_date };
338+
struct prio_queue not_queue = { compare_commits_by_gen_then_commit_date };
339+
struct commit_list *list;
225340
struct last_modified_callback_data data = { .lm = lm };
226341

227342
lm->rev.diffopt.output_format = DIFF_FORMAT_CALLBACK;
228343
lm->rev.diffopt.format_callback = last_modified_diff;
229344
lm->rev.diffopt.format_callback_data = &data;
345+
lm->rev.no_walk = 1;
230346

231347
prepare_revision_walk(&lm->rev);
232348

233-
while (hashmap_get_size(&lm->paths)) {
234-
data.commit = get_revision(&lm->rev);
235-
if (!data.commit)
236-
BUG("paths remaining beyond boundary in last-modified");
349+
max_count = lm->rev.max_count;
350+
351+
init_active_paths_for_commit(&lm->active_paths);
352+
lm->scratch = bitmap_word_alloc(lm->all_paths_nr);
353+
354+
/*
355+
* lm->rev.commits holds the set of boundary commits for our walk.
356+
*
357+
* Loop through each such commit, and place it in the appropriate queue.
358+
*/
359+
for (list = lm->rev.commits; list; list = list->next) {
360+
struct commit *c = list->item;
361+
362+
if (c->object.flags & BOTTOM) {
363+
prio_queue_put(&not_queue, c);
364+
c->object.flags |= PARENT2;
365+
} else if (!(c->object.flags & PARENT1)) {
366+
/*
367+
* If the commit is a starting point (and hasn't been
368+
* seen yet), then initialize the set of interesting
369+
* paths, too.
370+
*/
371+
struct bitmap *active;
372+
373+
prio_queue_put(&queue, c);
374+
c->object.flags |= PARENT1;
375+
376+
active = active_paths_for(lm, c);
377+
for (size_t i = 0; i < lm->all_paths_nr; i++)
378+
bitmap_set(active, i);
379+
}
380+
}
237381

238-
if (data.commit->object.flags & BOUNDARY) {
382+
while (queue.nr) {
383+
int parent_i;
384+
struct commit_list *p;
385+
struct commit *c = prio_queue_get(&queue);
386+
struct bitmap *active_c = active_paths_for(lm, c);
387+
388+
if ((0 <= max_count && max_count < ++queue_popped) ||
389+
(c->object.flags & PARENT2)) {
390+
/*
391+
* Either a boundary commit, or we have already seen too
392+
* many others. Either way, stop here.
393+
*/
394+
c->object.flags |= PARENT2 | BOUNDARY;
395+
data.commit = c;
239396
diff_tree_oid(lm->rev.repo->hash_algo->empty_tree,
240-
&data.commit->object.oid, "",
241-
&lm->rev.diffopt);
397+
&c->object.oid,
398+
"", &lm->rev.diffopt);
242399
diff_flush(&lm->rev.diffopt);
400+
goto cleanup;
401+
}
243402

244-
break;
403+
/*
404+
* Otherwise, make sure that 'c' isn't reachable from anything
405+
* in the '--not' queue.
406+
*/
407+
repo_parse_commit(lm->rev.repo, c);
408+
409+
while (not_queue.nr) {
410+
struct commit_list *np;
411+
struct commit *n = prio_queue_get(&not_queue);
412+
413+
repo_parse_commit(lm->rev.repo, n);
414+
415+
for (np = n->parents; np; np = np->next) {
416+
if (!(np->item->object.flags & PARENT2)) {
417+
prio_queue_put(&not_queue, np->item);
418+
np->item->object.flags |= PARENT2;
419+
}
420+
}
421+
422+
if (commit_graph_generation(n) < commit_graph_generation(c))
423+
break;
245424
}
246425

247-
if (!maybe_changed_path(lm, data.commit))
248-
continue;
426+
/*
427+
* Look at each parent and pass on each path that's TREESAME
428+
* with that parent. Stop early when no active paths remain.
429+
*/
430+
for (p = c->parents, parent_i = 0; p; p = p->next, parent_i++) {
431+
process_parent(lm, &queue,
432+
c, active_c,
433+
p->item, parent_i);
434+
435+
if (bitmap_is_empty(active_c))
436+
break;
437+
}
438+
439+
/*
440+
* Paths that remain active, or not TREESAME with any parent,
441+
* were changed by 'c'.
442+
*/
443+
if (!bitmap_is_empty(active_c)) {
444+
data.commit = c;
445+
for (size_t i = 0; i < lm->all_paths_nr; i++) {
446+
if (bitmap_get(active_c, i))
447+
mark_path(lm->all_paths[i], NULL, &data);
448+
}
449+
}
249450

250-
log_tree_commit(&lm->rev, data.commit);
451+
cleanup:
452+
active_paths_free(lm, c);
251453
}
252454

455+
if (hashmap_get_size(&lm->paths))
456+
BUG("paths remaining beyond boundary in last-modified");
457+
458+
clear_prio_queue(&not_queue);
459+
clear_prio_queue(&queue);
460+
clear_active_paths_for_commit(&lm->active_paths);
461+
bitmap_free(lm->scratch);
462+
253463
return 0;
254464
}
255465

256466
static int last_modified_init(struct last_modified *lm, struct repository *r,
257467
const char *prefix, int argc, const char **argv)
258468
{
469+
struct hashmap_iter iter;
470+
struct last_modified_entry *ent;
471+
259472
hashmap_init(&lm->paths, last_modified_entry_hashcmp, NULL, 0);
260473

261474
repo_init_revisions(r, &lm->rev, prefix);
@@ -280,6 +493,13 @@ static int last_modified_init(struct last_modified *lm, struct repository *r,
280493
if (populate_paths_from_revs(lm) < 0)
281494
return error(_("unable to setup last-modified"));
282495

496+
CALLOC_ARRAY(lm->all_paths, hashmap_get_size(&lm->paths));
497+
lm->all_paths_nr = 0;
498+
hashmap_for_each_entry(&lm->paths, &iter, ent, hashent) {
499+
ent->diff_idx = lm->all_paths_nr++;
500+
lm->all_paths[ent->diff_idx] = ent->path;
501+
}
502+
283503
return 0;
284504
}
285505

object.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ void object_array_init(struct object_array *array);
7575
* http-push.c: 11-----14
7676
* commit-graph.c: 15
7777
* commit-reach.c: 16-----19
78+
* builtin/last-modified.c: 1617
7879
* sha1-name.c: 20
7980
* list-objects-filter.c: 21
8081
* bloom.c: 2122

t/t8020-last-modified.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,9 @@ test_expect_success 'last-modified recursive' '
5757

5858
test_expect_success 'last-modified recursive with show-trees' '
5959
check_last_modified -r -t <<-\EOF
60-
3 a
6160
3 a/b
6261
3 a/b/file
62+
3 a
6363
2 a/file
6464
1 file
6565
EOF

0 commit comments

Comments
 (0)