From eb7f0e924d87153823e4672a4f008fd80939b693 Mon Sep 17 00:00:00 2001 From: Tim Eliseo <667606+teliseo@users.noreply.github.com> Date: Fri, 31 Jan 2025 01:01:36 -0800 Subject: [PATCH 1/5] Fuzzy match deltas for file creation with default date-variant comments --- git-sccsimport.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) mode change 100644 => 100755 git-sccsimport.py diff --git a/git-sccsimport.py b/git-sccsimport.py old mode 100644 new mode 100755 index 5929e89..901aafe --- a/git-sccsimport.py +++ b/git-sccsimport.py @@ -474,7 +474,18 @@ def FetchDeltaProperties(self): self._seqno = int(self._seqno) self._parent_seqno = int(self._parent_seqno) if self._comment == "\n": - self._comment = None + self._comment = self._cmp_comment = None + elif self._parent_seqno == 0: # Only for file creation delta + # Remove the variable part (the date/time) from a default file + # creation comment when used for fuzzy comparison, + # and replace with SCCS_ESCAPE as a sentinal unlikely to appear in + # real comments. This allows consecutive creations to be folded into + # one git commit in the same way as other comments. + self._cmp_comment = re.sub( + r"^date and time created \d\d/\d\d/\d\d \d\d:\d\d:\d\d by ", + '\x01', self._comment, flags=re.ASCII) + else: + self._cmp_comment = self._comment assert sidcheck==self._sid self._mrs = mrlist.split() @@ -483,18 +494,13 @@ def FetchDeltaProperties(self): def SameFuzzyCommit(self, other): #print(("SameFuzzyCommit: comparing\n1: %s with\n2: %s" # % (self, other)), file=sys.stderr) - if self._comment != other._comment: - return False - elif self._committer != other._committer: - return False - elif self._mrs != other._mrs: - return False - else: - delta = abs(other._timestamp - self._timestamp) - if delta > FUZZY_WINDOW or self._comment == "": - return False - else: - return True + return ( + self._comment != "" and + self._cmp_comment == other._cmp_comment and + self._committer == other._committer and + self._mrs == other._mrs and + abs(other._timestamp - self._timestamp) <= FUZZY_WINDOW + ) def SetTimestamp(self, checkin_date, checkin_time): try: From 48223d196dd585c0c86a550015c2d3b44ca3d3bc Mon Sep 17 00:00:00 2001 From: Tim Eliseo <667606+teliseo@users.noreply.github.com> Date: Fri, 31 Jan 2025 22:11:55 -0800 Subject: [PATCH 2/5] Add --no-combine-create option to disable fuzzy matching of varying create comments --- git-sccsimport.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/git-sccsimport.py b/git-sccsimport.py index 901aafe..4f0fd8b 100755 --- a/git-sccsimport.py +++ b/git-sccsimport.py @@ -158,6 +158,7 @@ verbose = False DoTags = True +DoCombineCreate = True class ImportFailure(Exception): pass @@ -475,7 +476,7 @@ def FetchDeltaProperties(self): self._parent_seqno = int(self._parent_seqno) if self._comment == "\n": self._comment = self._cmp_comment = None - elif self._parent_seqno == 0: # Only for file creation delta + elif DoCombineCreate and self._parent_seqno == 0: # Only for file creation delta # Remove the variable part (the date/time) from a default file # creation comment when used for fuzzy comparison, # and replace with SCCS_ESCAPE as a sentinal unlikely to appear in @@ -1087,6 +1088,7 @@ def ParseOptions(argv): global MoveDate global MoveOffset global DoTags + global DoCombineCreate global verbose global AuthorMap @@ -1125,6 +1127,8 @@ def ParseOptions(argv): parser.add_option("--move-offset", help=("set the number of hours between timezones for" " --move-date, old to new")) + parser.add_option("--no-combine-create", default=False, action="store_true", + help="Don't combine file create deltas with date-divergent comments.") parser.add_option("--no-tags", default=False, action="store_true", help="Don't try to create tags on SID level bumps.") parser.add_option("--stdout", default=False, action="store_true", @@ -1174,12 +1178,16 @@ def ParseOptions(argv): if options.authormap: AuthorMap = GetAuthorMap(options.authormap) + if options.no_combine_create: + DoCombineCreate = False + try: FUZZY_WINDOW = float(options.fuzzy_commit_window) except ValueError: raise UsageError("The argument for the --fuzzy-commit-window option " "should be a number, but you specified '%s'" % (options.fuzzy_commit_window,)) + IMPORT_REF = "refs/heads/%s" % (options.branch,) return options, args From b82fc58f07eb9f527b2c12ccc5b16212a5e723ee Mon Sep 17 00:00:00 2001 From: Tim Eliseo <667606+teliseo@users.noreply.github.com> Date: Fri, 31 Jan 2025 22:14:13 -0800 Subject: [PATCH 3/5] Implement fuzzy merging of non-time-contiguous deltas into one commit - By default, this non-contiguous merging is enabled, but may be disabled using the --no-combine-separate option, restoring previous behavior. - By default, the git commit gets the date of the latest SCCS delta, but this may be disabled using the --commit-date-earliest option, restoring previous behavior. - Fix capitalization of help texts. This implementation has one flaw: With non-contiguous merging and latest commit date both enabled (the default), git commit timestamps may be non-monotonic. --- git-sccsimport.py | 91 +++++++++++++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 30 deletions(-) diff --git a/git-sccsimport.py b/git-sccsimport.py index 4f0fd8b..2e9966b 100755 --- a/git-sccsimport.py +++ b/git-sccsimport.py @@ -48,10 +48,6 @@ # - perhaps we could just commit branch deltas to "br--R.L.B" refs # (which are created with a "reset" section)? # -# - fuzzy commit comparison would work better if delta sorting was smarter, -# e.g. if it could do some kind of "sliding window sort" on the comment text -# over a group of commits. -# # - can the SCCS "descriptive text" (:FD:) be used and useful in Git? (store it # in a special attribute?) # @@ -149,7 +145,7 @@ # Two checkins separated by more than FUZZY_WINDOW will never be considered part # of the same commit; N.B. even if they have the same non-empty comment, -# commiter, and MRs. (I.e. this can be a relatively large number, e.g. 1 day, +# committer, and MRs. (I.e. this can be a relatively large number, e.g. 1 day, # or even potentially a much longer time, such as a week.) FUZZY_WINDOW = 24.0 * 60.0 * 60.0 * 7.0 @@ -159,6 +155,8 @@ DoTags = True DoCombineCreate = True +DoCombineSeparate = True +CommitDateEarliest = False class ImportFailure(Exception): pass @@ -499,8 +497,7 @@ def SameFuzzyCommit(self, other): self._comment != "" and self._cmp_comment == other._cmp_comment and self._committer == other._committer and - self._mrs == other._mrs and - abs(other._timestamp - self._timestamp) <= FUZZY_WINDOW + self._mrs == other._mrs ) def SetTimestamp(self, checkin_date, checkin_time): @@ -746,7 +743,7 @@ def Progress(self, done, items): msg = "\r %3.0f%% (%d/%d)%s" % (percent, done, items, tail,) self.ProgressMsg(msg) - def BeginCommit(self, delta, parent): + def BeginCommit(self, delta_first, delta_last, parent): """Start a new commit (having the indicated parent).""" mark = self.GetNextMark() self.Write("commit %s\nmark :%d\n" % (IMPORT_REF, mark,)) @@ -762,14 +759,17 @@ def BeginCommit(self, delta, parent): # --contains a965bb31" tells me this will be v2.21.0 or newer: # if tuple(GitVer.split(".")) >= tuple("2.21.0".split(".")): + # Use sid/seqno of the first delta of the group, not the last, to + # be sure they're monotonically increasing self.Write("original-oid %s-%s-%s\n" - % (delta._sccsfile._filename, delta._sid, delta._seqno)) + % (delta_first._sccsfile._filename, delta_first._sid, delta_first._seqno)) - ts = delta.GitTimestamp() + # If --commit-date-earliest, then delta_last may actually be the same as delta_first + ts = delta_last.GitTimestamp() self.Write("committer %s %s\n" - % (delta._ui.email, ts)) + % (delta_last._ui.email, ts)) - self.WriteData(delta.GitComment()) + self.WriteData(delta_last.GitComment()) if parent: self.Write("from :%d\n" % (parent,)) @@ -815,16 +815,15 @@ def CompleteCommit(self): """Write the final part of a commit.""" self.Write("\n") - -# TODO: if the fuzzy commit logic puts subsequent deltas into the same -# commit, the timestamp of the commit is that of the first delta. -# One could argue that the timestamp of the last one would be a better choice. - +# If the fuzzy commit logic puts subsequent deltas into the same +# commit, the timestamp of the commit is that of the last delta, by default. +# The --commit-date-earliest option instead uses the timestamp of the first +# delta, matching previous behavior of this program. def ImportDeltas(imp, deltas): if not deltas: raise ImportFailure("No deltas to import") - first_delta_in_commit = None + commit_begun = False done = 0 imp.ProgressMsg("\nCreating commits...\n") plevel = None @@ -832,26 +831,46 @@ def ImportDeltas(imp, deltas): pdelta = None commit_count = 0 write_tag_next = False - for d in deltas: + next_divergent_didx = 0 + for didx in range(len(deltas)): imp.Progress(done, len(deltas)) done += 1 - # Figure out if we need to start a new commit. - if first_delta_in_commit: - if not first_delta_in_commit.SameFuzzyCommit(d): + d = deltas[didx] + if didx == next_divergent_didx: + # Look ahead within FUZZY_WINDOW time to find matching deltas, and + # group them together in the list so that they may all be part of + # one git commit. + next_divergent_didx += 1 + for didx_same in range(next_divergent_didx, len(deltas)): + if deltas[didx_same]._timestamp - d._timestamp > FUZZY_WINDOW: + break + if d.SameFuzzyCommit(deltas[didx_same]): + # If necessary, hoist this commit up to be adjacent to + # other commits that pass the fuzzy match + if next_divergent_didx != didx_same: + deltas.insert(next_divergent_didx, deltas.pop(didx_same)) + next_divergent_didx += 1 + elif not DoCombineSeparate: + # If --no-combine-separate is set, the first mismatch ends + # the group + break + + # Figure out if we need to start a new commit. + if commit_begun: imp.CompleteCommit() - first_delta_in_commit = None + commit_begun = False if DoTags and write_tag_next: - imp.WriteTag(plevel, current - 1) + imp.WriteTag(plevel, parent - 1) write_tag_next = False if plevel and d.SidLevel() > plevel.SidLevel() and d.SidRev() == 1: write_tag_next = True - if first_delta_in_commit is None: - first_delta_in_commit = d - current = imp.BeginCommit(d, parent) + if not commit_begun: + commit_begun = True + parent = imp.BeginCommit(d, + d if CommitDateEarliest else deltas[next_divergent_didx - 1], parent) commit_count += 1 - parent = current if pdelta: plevel = d @@ -1089,6 +1108,8 @@ def ParseOptions(argv): global MoveOffset global DoTags global DoCombineCreate + global DoCombineSeparate + global CommitDateEarliest global verbose global AuthorMap @@ -1098,7 +1119,7 @@ def ParseOptions(argv): parser = optparse.OptionParser() parser.add_option("--branch", - help="branch to populate", + help="Branch to populate", default="master") parser.add_option("--maildomain", help="Mail domain for usernames taken from SCCS files") @@ -1106,6 +1127,8 @@ def ParseOptions(argv): help="Default UTC offset for timestamps (default: %s)" % (DEFAULT_USER_TZ,)) parser.add_option("--authormap", help="File mapping author user-IDs to Git style user.{name,email}") + parser.add_option("--commit-date-earliest", default=False, action="store_true", + help="Commits have timestamp of earliest delta, not (default) latest.") parser.add_option("--dirs", action="store_true", help=("Command-line arguments are a list " @@ -1122,13 +1145,15 @@ def ParseOptions(argv): parser.add_option("--init", default=False, action="store_true", help="Initialise the git repository first") parser.add_option("--move-date", - help=("set the date SCCS files moved between timezones" + help=("Set the date SCCS files moved between timezones" " (in ISO8601 form: YYYY/MM/DDTHH:MM:SS)")) parser.add_option("--move-offset", help=("set the number of hours between timezones for" " --move-date, old to new")) parser.add_option("--no-combine-create", default=False, action="store_true", help="Don't combine file create deltas with date-divergent comments.") + parser.add_option("--no-combine-separate", default=False, action="store_true", + help="Don't combine similar deltas not contiguous in time.") parser.add_option("--no-tags", default=False, action="store_true", help="Don't try to create tags on SID level bumps.") parser.add_option("--stdout", default=False, action="store_true", @@ -1181,6 +1206,12 @@ def ParseOptions(argv): if options.no_combine_create: DoCombineCreate = False + if options.no_combine_separate: + DoCombineSeparate = False + + if options.commit_date_earliest: + CommitDateEarliest = True + try: FUZZY_WINDOW = float(options.fuzzy_commit_window) except ValueError: From 72f5b0687ef9f9e4d95c68b3d1b4aec3f2f631a7 Mon Sep 17 00:00:00 2001 From: Tim Eliseo <667606+teliseo@users.noreply.github.com> Date: Sun, 2 Feb 2025 11:50:51 -0800 Subject: [PATCH 4/5] Fix non-contiguous delta merging to keep commit timestamps monotonic - Commits are reordered as necessary to guarantee monotonic commit timestamps even with non-contiguous merging, and using the latest delta timestamp as the commit timestamp (these methods are the current defaults). - Ensure that multiple deltas of the same file can't be combined in one commit. - Change --fuzzy-commit-window option to be an exclusive rather than inclusive bound so that 0 always creates one commit per delta (previously this behavior required the non-intuitive -1 to be specified). --- git-sccsimport.py | 107 +++++++++++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 44 deletions(-) diff --git a/git-sccsimport.py b/git-sccsimport.py index 2e9966b..8a089e9 100755 --- a/git-sccsimport.py +++ b/git-sccsimport.py @@ -743,7 +743,7 @@ def Progress(self, done, items): msg = "\r %3.0f%% (%d/%d)%s" % (percent, done, items, tail,) self.ProgressMsg(msg) - def BeginCommit(self, delta_first, delta_last, parent): + def BeginCommit(self, delta, parent): """Start a new commit (having the indicated parent).""" mark = self.GetNextMark() self.Write("commit %s\nmark :%d\n" % (IMPORT_REF, mark,)) @@ -759,17 +759,14 @@ def BeginCommit(self, delta_first, delta_last, parent): # --contains a965bb31" tells me this will be v2.21.0 or newer: # if tuple(GitVer.split(".")) >= tuple("2.21.0".split(".")): - # Use sid/seqno of the first delta of the group, not the last, to - # be sure they're monotonically increasing self.Write("original-oid %s-%s-%s\n" - % (delta_first._sccsfile._filename, delta_first._sid, delta_first._seqno)) + % (delta._sccsfile._filename, delta._sid, delta._seqno)) - # If --commit-date-earliest, then delta_last may actually be the same as delta_first - ts = delta_last.GitTimestamp() + ts = delta.GitTimestamp() self.Write("committer %s %s\n" - % (delta_last._ui.email, ts)) + % (delta._ui.email, ts)) - self.WriteData(delta_last.GitComment()) + self.WriteData(delta.GitComment()) if parent: self.Write("from :%d\n" % (parent,)) @@ -823,60 +820,82 @@ def CompleteCommit(self): def ImportDeltas(imp, deltas): if not deltas: raise ImportFailure("No deltas to import") - commit_begun = False + + # Take a first pass through to group together deltas in the list within + # a FUZZY_WINDOW time that pass the matching tests. Each group then becomes + # a git commit. + first_didx = didx = 0 + end_matching_didx = 1 + while didx < len(deltas): # We can't iterate with for because didx gets moved + # didx is the potential last delta in a group, so we're checking it + # against the one after. First, in_win is whether we've reached the end + # of our time window. + in_win = didx < len(deltas) - 1 and \ + deltas[didx + 1]._timestamp - deltas[first_didx]._timestamp < FUZZY_WINDOW + # Test if the next delta fuzzy-matches the commit group, and also verify + # it isn't from an SCCS file already in this group (sometimes two deltas + # of the same file have exactly the same comment). + if (in_win and deltas[first_didx].SameFuzzyCommit(deltas[didx + 1]) and + not any(d._sccsfile is deltas[didx + 1]._sccsfile + for d in deltas[first_didx:end_matching_didx])): + # If necessary, hoist this commit up to be adjacent to + # other commits that pass the fuzzy match + didx += 1 + if end_matching_didx != didx: + deltas.insert(end_matching_didx, deltas.pop(didx)) + end_matching_didx += 1 # Delta has been added to the group + elif not in_win or not DoCombineSeparate: + # Mark this commit group as finished and reset loop to start new group + for d_grp in deltas[first_didx:end_matching_didx]: + d_grp._last_delta = deltas[end_matching_didx - 1] + deltas[first_didx]._first_marker = True + first_didx = didx = end_matching_didx + end_matching_didx = didx + 1 + else: # in_win and DoCombineSeparate and not fuzzy match + # We're still in the window, but skip this non-matching delta + didx += 1 + + # If deltas got moved to be together to combine them, and we're using the + # last delta date for the git commit, the commit dates could end up + # non-monotonic, so sort again to re-order the commits. Otherwise, this + # isn't necessary (and would be wrong for CommitDateEarliest if combining). + # Python's stable sort guarantees that commits, which might happen to share + # same-date sort keys, won't get mixed or jumbled. + if DoCombineSeparate and not CommitDateEarliest: + deltas.sort(key=attrgetter('_last_delta._timestamp')) + done = 0 imp.ProgressMsg("\nCreating commits...\n") + commit_begun = False plevel = None parent = None pdelta = None commit_count = 0 write_tag_next = False - next_divergent_didx = 0 - for didx in range(len(deltas)): + for d in deltas: imp.Progress(done, len(deltas)) done += 1 - d = deltas[didx] - if didx == next_divergent_didx: - # Look ahead within FUZZY_WINDOW time to find matching deltas, and - # group them together in the list so that they may all be part of - # one git commit. - next_divergent_didx += 1 - for didx_same in range(next_divergent_didx, len(deltas)): - if deltas[didx_same]._timestamp - d._timestamp > FUZZY_WINDOW: - break - if d.SameFuzzyCommit(deltas[didx_same]): - # If necessary, hoist this commit up to be adjacent to - # other commits that pass the fuzzy match - if next_divergent_didx != didx_same: - deltas.insert(next_divergent_didx, deltas.pop(didx_same)) - next_divergent_didx += 1 - elif not DoCombineSeparate: - # If --no-combine-separate is set, the first mismatch ends - # the group - break - - # Figure out if we need to start a new commit. - if commit_begun: - imp.CompleteCommit() - commit_begun = False - if DoTags and write_tag_next: - imp.WriteTag(plevel, parent - 1) - write_tag_next = False + # Figure out if we need to start a new commit + if commit_begun and getattr(d, '_first_marker', False): + imp.CompleteCommit() + commit_begun = False + if DoTags and write_tag_next: + imp.WriteTag(plevel, parent - 1) + write_tag_next = False - if plevel and d.SidLevel() > plevel.SidLevel() and d.SidRev() == 1: - write_tag_next = True + if plevel and d.SidLevel() > plevel.SidLevel() and d.SidRev() == 1: + write_tag_next = True if not commit_begun: commit_begun = True - parent = imp.BeginCommit(d, - d if CommitDateEarliest else deltas[next_divergent_didx - 1], parent) + parent = imp.BeginCommit(d if CommitDateEarliest else d._last_delta, parent) commit_count += 1 if pdelta: plevel = d pdelta = d - # We're now in a commit. Emit the body for this delta. + # We're now in a commit, so emit the body for this delta. body = GetBody(d._sccsfile._filename, d._seqno, EXPAND_KEYWORDS) if len(body) == 0: imp.Filedelete(d._sccsfile) @@ -1138,7 +1157,7 @@ def ParseOptions(argv): help="Expand keywords") parser.add_option("--fuzzy-commit-window", default=FUZZY_WINDOW, - help=("Deltas more than this many seconds apart " + help=("Deltas this many seconds apart or more " "are always considered to be in different commits")) parser.add_option("--git-dir", help="Directory containing the git repository") From 0b88c425b7040a9c12cf8ab57465696747310cf8 Mon Sep 17 00:00:00 2001 From: Tim Eliseo <667606+teliseo@users.noreply.github.com> Date: Sun, 2 Feb 2025 12:45:57 -0800 Subject: [PATCH 5/5] Remove periods from some help texts (mostly mine) for consistency --- git-sccsimport.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/git-sccsimport.py b/git-sccsimport.py index 8a089e9..ca72109 100755 --- a/git-sccsimport.py +++ b/git-sccsimport.py @@ -1147,7 +1147,7 @@ def ParseOptions(argv): parser.add_option("--authormap", help="File mapping author user-IDs to Git style user.{name,email}") parser.add_option("--commit-date-earliest", default=False, action="store_true", - help="Commits have timestamp of earliest delta, not (default) latest.") + help="Commits have timestamp of earliest delta, not (default) latest") parser.add_option("--dirs", action="store_true", help=("Command-line arguments are a list " @@ -1170,11 +1170,11 @@ def ParseOptions(argv): help=("set the number of hours between timezones for" " --move-date, old to new")) parser.add_option("--no-combine-create", default=False, action="store_true", - help="Don't combine file create deltas with date-divergent comments.") + help="Don't combine file create deltas with date-divergent comments") parser.add_option("--no-combine-separate", default=False, action="store_true", - help="Don't combine similar deltas not contiguous in time.") + help="Don't combine similar deltas not contiguous in time") parser.add_option("--no-tags", default=False, action="store_true", - help="Don't try to create tags on SID level bumps.") + help="Don't try to create tags on SID level bumps") parser.add_option("--stdout", default=False, action="store_true", help=("Send git-fast-import data to stdout " "rather than to git-fast-import")) @@ -1182,9 +1182,9 @@ def ParseOptions(argv): help=("Use the 'sccs' front-end for SCCS commands" " (by default need for 'sccs' is auto-detected)")) parser.add_option("--debug", default=False, action="store_true", - help="Print all commands being run and any stderr output.") + help="Print all commands being run and any stderr output") parser.add_option("--verbose", default=False, action="store_true", - help="Print more verbose status messages.") + help="Print more verbose status messages") (options, args) = parser.parse_args(argv)