diff --git a/README.md b/README.md index b143867..8cba9d0 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,7 @@ A small python script which imports cvs tree into git repository. Pros: - Small footprint - Supports incremental import. It's very fast -- Converts tags on HEAD -- Converts all tags and CVS branches to Git branches with `-A` +- Converts CVS branches and tags with `-A` - Everything is done in memory An alternative to @@ -27,9 +26,8 @@ Prerequisite: Usage ----- - usage: cvs2gitdump [-aAh] [-z fuzz] [-e email_domain] - [-E log_encodings] [-k rcs_keywords] [-b branch] - [-m module] [-l last_revision] + usage: cvs2gitdump [-aAh] [-z fuzz] [-e email_domain] [-E log_encodings] + [-k rcs_keywords] [-b branch] [-m module] [-l last_revision] cvsroot [git_dir] @@ -44,7 +42,9 @@ Usage * -A - Convert all tags and CVS branches. + Convert CVS branches and tags. Branches and tags are reset to generated + commits. No extra commits are synthesized to normalize CVS branch/tag + snapshots or symbol-expanded keyword content. 
#!/usr/bin/env python3
"""Classify the mismatch lines reported by verify-cvs-git.py.

Usage:
    classify-verify-output.py [--details FILE] cvsroot \
        module output-file git-dir [module output-file git-dir ...]

Each output file holds one record per line in the form ``ref kind path``
(single spaces; the path is the remainder of the line and may contain
spaces).  For every record on a non-HEAD ref the matching RCS file is
inspected to find out which revision the CVS symbol selects, and the record
is assigned a class and a probable cause.  A tab-separated summary is
printed on stdout; ``--details`` additionally writes one row per record.
"""

import argparse
import hashlib
import re
import subprocess
import tempfile
from collections import defaultdict
from pathlib import Path


HEAD_REF = 'refs/heads/master'
# A bare RCS revision number on a line of its own starts a delta header.
REVISION = re.compile(r'^\d+(?:\.\d+)+$')
# The "symbols" admin keyword, as opposed to a word that merely starts with it.
SYMBOLS = re.compile(r'^symbols(?=[\s;]|$)')
STATE = re.compile(r'\bstate\s+([^;]+);')


class Record:
    """One verifier output line, tagged with the module it came from."""

    def __init__(self, module, ref, kind, path):
        self.module = module
        self.ref = ref
        self.kind = kind
        self.path = path
        # CVS symbol named by the ref, or None for HEAD_REF / foreign refs.
        self.symbol = symbol_from_ref(ref)


class ModuleInput:
    """A (module, verifier output file, git directory) command-line triple."""

    def __init__(self, module, output_file, git_dir):
        self.module = module
        self.output_file = output_file
        self.git_dir = git_dir


class SelectedInfo:
    """What a CVS symbol selects in one RCS file.

    ``cls`` is the classification string, ``revision`` the selected RCS
    revision (if any), ``blob`` the git blob id of its checkout (live
    revisions only) and ``error`` the checkout failure message (if any).
    """

    def __init__(self, cls, revision=None, blob=None, error=None):
        self.cls = cls
        self.revision = revision
        self.blob = blob
        self.error = error


def parse_args():
    """Parse the command line; exits via argparse on malformed triples."""
    parser = argparse.ArgumentParser(
        description='Classify verify-cvs-git.py mismatch output.')
    parser.add_argument('--details')
    parser.add_argument('cvsroot')
    parser.add_argument(
        'inputs', nargs='+',
        help='module output-file git-dir triples')
    args = parser.parse_args()
    if len(args.inputs) % 3 != 0:
        parser.error('inputs must be module output-file git-dir triples')
    return args


def symbol_from_ref(ref):
    """Return the last path component of a branch/tag ref, else None.

    HEAD_REF and refs outside refs/heads/ and refs/tags/ yield None.
    """
    if ref == HEAD_REF:
        return None
    if ref.startswith('refs/heads/') or ref.startswith('refs/tags/'):
        return ref.rsplit('/', 1)[-1]
    return None


def read_records(module, filename):
    """Read ``ref kind path`` lines from *filename* into Record objects.

    Blank lines are skipped.  Raises ValueError naming the file and line
    number when a line does not have all three fields.
    """
    records = []
    with open(filename, 'r', encoding='utf-8',
              errors='surrogateescape') as file:
        for lineno, line in enumerate(file, 1):
            line = line.rstrip('\n')
            if not line.strip():
                continue
            fields = line.split(' ', 2)
            if len(fields) != 3:
                raise ValueError(
                    '%s:%d: expected "ref kind path", got %r' % (
                        filename, lineno, line))
            records.append(Record(module, *fields))
    return records


def rcs_path(cvsroot, module, path):
    """Locate the ``,v`` file for *path*, looking in Attic as a fallback.

    Returns a Path, or None when neither location exists.
    """
    base = Path(cvsroot) / module
    normal = base / (path + ',v')
    if normal.exists():
        return normal

    parts = path.split('/')
    attic = base.joinpath(*parts[:-1], 'Attic', parts[-1] + ',v')
    if attic.exists():
        return attic

    return None


def _is_magic_branch(parts):
    """True for a CVS magic branch number such as 1.2.0.4.

    The marker zero sits in the second-to-last position of an even-length
    number; a zero anywhere else is an ordinary component.
    """
    return len(parts) >= 4 and len(parts) % 2 == 0 and parts[-2] == '0'


def is_branch_symbol_revision(revision):
    """True when a symbol value names a branch rather than a revision.

    That is either a magic branch number (1.2.0.4) or a plain branch
    number with an odd component count (1.1.1).
    """
    parts = revision.split('.')
    return _is_magic_branch(parts) or len(parts) % 2 == 1


def normalize_branch_revision(revision):
    """Turn a magic branch number (1.2.0.4) into the branch number (1.2.4).

    Any other value is returned unchanged.
    """
    parts = revision.split('.')
    if _is_magic_branch(parts):
        return '.'.join(parts[:-2] + parts[-1:])
    return revision


def branchpoint_revision(branch):
    """Return the revision a branch number sprouts from (1.2.4 -> 1.2)."""
    parts = branch.split('.')
    return '.'.join(parts[:-1])


def revision_key(revision):
    """Sort key comparing revision numbers component-wise as integers."""
    return tuple(int(part) for part in revision.split('.'))


def git_blob_hash(data):
    """Return the SHA-1 object id git assigns to a blob holding *data*."""
    header = ('blob %d\0' % len(data)).encode('ascii')
    return hashlib.sha1(header + data).hexdigest()


def co_revision(path, revision):
    """Return the bytes ``co -p`` prints for *revision* of RCS file *path*.

    Raises RuntimeError carrying co's stderr when co exits non-zero.
    """
    proc = subprocess.run(
        ['co', '-q', '-p', '-r' + revision, str(path)],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if proc.returncode != 0:
        raise RuntimeError(proc.stderr.decode('utf-8', 'replace').strip())
    return proc.stdout


class RcsInfo:
    """Subset of an RCS file: wanted symbols and per-revision states."""

    def __init__(self):
        self.symbols = {}   # symbol name -> revision / branch number
        self.states = {}    # revision -> state string ('Exp', 'dead', ...)


def parse_rcs(path, needed):
    """Scan the admin and delta sections of the RCS file at *path*.

    Only symbols whose name is in *needed* are recorded.  Scanning stops at
    the ``desc`` line, so log messages and file texts are never read.
    """
    info = RcsInfo()
    in_symbols = False
    symbols_done = False
    revision = None

    with open(path, 'rb') as file:
        for raw_line in file:
            line = raw_line.decode('latin-1').strip()
            if line == 'desc':
                break

            if not in_symbols and not symbols_done and SYMBOLS.match(line):
                in_symbols = True
                line = line[len('symbols'):].strip()

            if in_symbols:
                # The symbols phrase ends at its own ';'.  Do not wait for a
                # 'locks;' line: a non-empty lock list never produces one.
                body, terminator, _ = line.partition(';')
                for item in body.split():
                    if ':' not in item:
                        continue
                    name, value = item.split(':', 1)
                    if name in needed:
                        info.symbols[name] = value
                if terminator:
                    in_symbols = False
                    symbols_done = True
                continue

            if REVISION.match(line):
                revision = line
                continue

            if revision is None:
                continue

            state = STATE.search(line)
            if state:
                info.states[revision] = state.group(1)

    return info


def selected_branch_revision(info, branch):
    """Return the highest known revision on *branch*.

    Falls back to the branch point when the branch has no revisions.
    """
    depth = len(branch.split('.')) + 1
    prefix = branch + '.'
    candidates = [
        revision for revision in info.states
        if revision.startswith(prefix) and len(revision.split('.')) == depth
    ]
    if candidates:
        return max(candidates, key=revision_key)
    return branchpoint_revision(branch)


def selected_revision_info(rcs, info, symbol):
    """Describe the revision *symbol* selects in the RCS file *rcs*.

    Live revisions are checked out with co and hashed as git blobs; a failed
    checkout is reported through SelectedInfo.error rather than raised.
    """
    revision = info.symbols.get(symbol)
    if revision is None:
        return SelectedInfo('no-cvs-symbol')

    if is_branch_symbol_revision(revision):
        kind = 'branch'
        revision = selected_branch_revision(
            info, normalize_branch_revision(revision))
    else:
        kind = 'tag'

    state = info.states.get(revision)
    if state == 'dead':
        return SelectedInfo('cvs-%s-selects-dead-revision' % kind, revision)
    if state is None:
        return SelectedInfo(
            'cvs-%s-selects-unknown-revision' % kind, revision)

    cls = 'cvs-%s-selects-live-revision' % kind
    try:
        blob = git_blob_hash(co_revision(rcs, revision))
    except RuntimeError as err:
        return SelectedInfo(cls, revision, error=str(err))
    return SelectedInfo(cls, revision, blob)


def collect_symbol_info(cvsroot, records):
    """Map (module, path, symbol) to SelectedInfo for symbol-bearing records.

    Every record that is not an 'error' and names a symbol is resolved, so
    the lookups done by the other collectors and by classify() always hit.
    Each RCS file is parsed once for all symbols requested of it.
    """
    needed = defaultdict(set)
    for record in records:
        if record.symbol is not None and record.kind != 'error':
            needed[(record.module, record.path)].add(record.symbol)

    info = {}
    for (module, path), symbols in needed.items():
        rcs = rcs_path(cvsroot, module, path)
        if rcs is None:
            for symbol in symbols:
                info[(module, path, symbol)] = SelectedInfo('no-rcs-file')
            continue

        rcs_info = parse_rcs(rcs, symbols)
        for symbol in symbols:
            info[(module, path, symbol)] = selected_revision_info(
                rcs, rcs_info, symbol)

    return info


def collect_wanted_blobs(records, symbol_info):
    """Return module -> set of CVS blob ids that should be looked up in git."""
    wanted = defaultdict(set)
    for record in records:
        if record.symbol is None or record.kind == 'error':
            continue
        info = symbol_info[(record.module, record.path, record.symbol)]
        if info.blob is not None:
            wanted[record.module].add(info.blob)
    return wanted


def git_blob_locations(git_dir, wanted):
    """Return blob id -> set of paths at which git reaches that blob.

    Only ids in *wanted* are reported.  Raises RuntimeError with git's
    stderr when ``git rev-list`` fails.
    """
    locations = defaultdict(set)
    if not wanted:
        return locations

    # stdout is streamed because the object list can be large.  stderr goes
    # to a temporary file instead of a second pipe so git can never block on
    # a full stderr pipe while stdout is still being read here.
    with tempfile.TemporaryFile() as stderr_file:
        with subprocess.Popen(
                ['git', '--git-dir=' + git_dir, 'rev-list', '--objects',
                 '--all'],
                stdout=subprocess.PIPE, stderr=stderr_file,
                encoding='utf-8', errors='surrogateescape') as proc:
            for line in proc.stdout:
                obj, _, path = line.rstrip('\n').partition(' ')
                if obj in wanted:
                    locations[obj].add(path)
        if proc.returncode != 0:
            stderr_file.seek(0)
            raise RuntimeError(
                stderr_file.read().decode('utf-8', 'replace').strip())

    return locations


def collect_git_locations(module_inputs, wanted):
    """Run git_blob_locations() once per module that has wanted blobs."""
    by_module = {item.module: item for item in module_inputs}
    return {
        module: git_blob_locations(by_module[module].git_dir, blobs)
        for module, blobs in wanted.items()
    }


def git_blob_at_path(git_dir, ref, path):
    """Return the object id ``git ls-tree`` lists for *path* at *ref*.

    Returns None when ls-tree prints nothing; raises RuntimeError with
    git's stderr when it fails.
    """
    proc = subprocess.run(
        ['git', '--git-dir=' + git_dir, 'ls-tree', '-z', ref, '--', path],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if proc.returncode != 0:
        raise RuntimeError(proc.stderr.decode('utf-8', 'replace').strip())
    if not proc.stdout:
        return None
    # Entry layout: "<mode> <type> <object>\t<name>\0"
    meta = proc.stdout.split(b'\t', 1)[0].decode('ascii')
    return meta.split()[2]


def collect_ref_blobs(records, module_inputs):
    """Map (module, ref, path) to the git object id for every 'diff' record."""
    blobs = {}
    by_module = {item.module: item for item in module_inputs}
    for record in records:
        if record.kind != 'diff':
            continue
        key = (record.module, record.ref, record.path)
        if key in blobs:
            continue
        blobs[key] = git_blob_at_path(
            by_module[record.module].git_dir, record.ref, record.path)
    return blobs


def git_cause(record, info, locations, ref_blobs):
    """Return the probable-cause label for a record with a CVS symbol."""
    if info.error is not None:
        return 'cvs_selected_revision_checkout_failed'

    if info.cls == 'no-cvs-symbol':
        return 'git_ref_contains_path_but_rcs_has_no_requested_symbol'
    if info.cls == 'no-rcs-file':
        return 'git_ref_contains_path_but_no_rcs_file_was_found'

    if info.cls.endswith('dead-revision'):
        return 'git_ref_contains_path_but_cvs_selects_dead_revision'
    if info.cls.endswith('unknown-revision'):
        return 'cvs_symbol_points_to_revision_missing_from_rcs_admin'

    if info.blob is None:
        return 'cvs_selected_live_revision_blob_was_not_computed'

    paths = locations.get(record.module, {}).get(info.blob, set())
    same_path = record.path in paths
    if record.kind == 'cvs-only':
        if same_path:
            return 'cvs_live_blob_reachable_at_same_path_but_not_this_ref'
        if paths:
            return 'cvs_live_blob_reachable_only_at_other_path'
        return 'cvs_live_blob_not_reachable_from_any_git_ref'

    if record.kind == 'diff':
        git_blob = ref_blobs.get((record.module, record.ref, record.path))
        if git_blob == info.blob:
            return ('numeric_cvs_revision_blob_matches_git_'
                    'symbolic_checkout_differs')
        if same_path:
            return 'git_ref_has_different_blob_cvs_blob_reachable_at_same_path'
        if paths:
            return 'git_ref_has_different_blob_cvs_blob_reachable_elsewhere'
        return 'git_ref_has_different_blob_cvs_blob_not_reachable'

    if record.kind == 'git-only':
        return 'git_ref_contains_path_but_cvs_does_not_select_live_file'

    return 'unclassified'


def classify(record, symbol_info, git_locations, ref_blobs):
    """Return the (class, cause) pair for one record."""
    if record.kind == 'error':
        if record.symbol is None:
            return 'head-error', 'verifier_reported_error'
        return 'error', 'verifier_reported_error'

    if record.symbol is None:
        return 'head-' + record.kind, 'head_comparison_has_no_cvs_symbol'

    info = symbol_info[(record.module, record.path, record.symbol)]
    return record.kind + '/' + info.cls, git_cause(
        record, info, git_locations, ref_blobs)


def summarize(records, symbol_info, git_locations, ref_blobs):
    """Aggregate records by (module, class, cause).

    Each value holds the line count and the distinct refs and paths seen.
    """
    summary = {}
    for record in records:
        cls, cause = classify(record, symbol_info, git_locations, ref_blobs)
        item = summary.setdefault(
            (record.module, cls, cause),
            {'lines': 0, 'refs': set(), 'paths': set()})
        item['lines'] += 1
        item['refs'].add(record.ref)
        item['paths'].add(record.path)
    return summary


def print_summary(summary):
    """Print the summary as TSV: by module, then by descending line count."""
    print('module\tclass\tcause\tlines\trefs\tpaths')
    for (module, cls, cause), item in sorted(
            summary.items(), key=lambda x: (x[0][0], -x[1]['lines'], x[0][1],
                                            x[0][2])):
        print('%s\t%s\t%s\t%d\t%d\t%d' % (
            module, cls, cause, item['lines'], len(item['refs']),
            len(item['paths'])))


def print_details(filename, records, symbol_info, git_locations, ref_blobs):
    """Write one TSV row per record to *filename*; unknown cells are empty."""
    with open(filename, 'w', encoding='utf-8',
              errors='surrogateescape') as file:
        print('module\tref\tkind\tpath\tclass\tcause\tcvs_revision\t'
              'cvs_blob\tgit_blob', file=file)
        for record in records:
            cls, cause = classify(
                record, symbol_info, git_locations, ref_blobs)
            info = None
            if record.symbol is not None and record.kind != 'error':
                info = symbol_info[
                    (record.module, record.path, record.symbol)]
            git_blob = ref_blobs.get(
                (record.module, record.ref, record.path), '')
            print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
                record.module, record.ref, record.kind, record.path, cls,
                cause,
                info.revision if info is not None and
                info.revision is not None else '',
                info.blob if info is not None and
                info.blob is not None else '',
                git_blob if git_blob is not None else ''),
                file=file)


def main():
    """Entry point: read every input, classify, print summary and details."""
    args = parse_args()
    records = []

    module_inputs = [
        ModuleInput(*values)
        for values in zip(args.inputs[0::3], args.inputs[1::3],
                          args.inputs[2::3])
    ]

    for item in module_inputs:
        records.extend(read_records(item.module, item.output_file))

    symbol_info = collect_symbol_info(args.cvsroot, records)
    git_locations = collect_git_locations(
        module_inputs, collect_wanted_blobs(records, symbol_info))
    ref_blobs = collect_ref_blobs(records, module_inputs)
    print_summary(summarize(records, symbol_info, git_locations, ref_blobs))
    if args.details is not None:
        print_details(args.details, records, symbol_info, git_locations,
                      ref_blobs)


if __name__ == '__main__':
    main()
.It Fl b Ar branch The git branch which is used for incremental import. With .Fl A , @@ -82,3 +86,21 @@ $ git --git-dir /git/openbsd.git fast-import < openbsd2.dump .Ed .Sh AUTHORS .An YASUOKA Masahiko. +.Sh CAVEATS +CVS branch and tag symbols are per-file snapshots, while Git branch and tag +references point to a single commit. +.Nm +selects existing generated commits for CVS symbols and does not synthesize +extra commits to make a Git tree exactly match every CVS symbol snapshot. +As a result, some imported branch or tag references can differ from +.Xr cvs 1 +checkouts for files whose CVS symbol revisions do not coexist in a single +generated Git commit. +.Pp +CVS also expands some keywords using the checkout symbol name. +For example, +.Sq $Name$ +can expand differently when a file is checked out with a branch or tag. +.Nm +does not synthesize branch- or tag-specific keyword-only commits, so such +keyword values may differ from CVS checkout output. diff --git a/cvs2gitdump.py b/cvs2gitdump.py index 6b60120..a5b7b40 100644 --- a/cvs2gitdump.py +++ b/cvs2gitdump.py @@ -62,8 +62,10 @@ def main(): convert_all = False existing_branches = set() existing_refs = set() + existing_ref_commits = dict() branch_sources = dict() tag_sources = dict() + branch_tips = dict() try: opts, args = getopt.getopt(sys.argv[1:], 'aAb:hm:z:e:E:k:t:l:') @@ -144,9 +146,13 @@ def main(): git = subprocess.Popen( ['git', '--git-dir=' + args[1], 'for-each-ref', - '--format=%(refname)', 'refs/heads', 'refs/tags'], + '--format=%(refname)%00%(objectname)', 'refs/heads', + 'refs/tags'], encoding='utf-8', stdout=subprocess.PIPE) - existing_refs = set([r.strip() for r in git.stdout.readlines()]) + for line in git.stdout.readlines(): + ref, commit = line.rstrip('\n').split('\x00', 1) + existing_refs.add(ref) + existing_ref_commits[ref] = commit git.wait() if git.returncode != 0: print("Couldn't exec git", file=sys.stderr) @@ -166,6 +172,9 @@ def main(): changesets = sorted(cvs.changesets) nchangesets 
= len(changesets) + if convert_all: + cvs.select_branch_bases(changesets) + cvs.prepare_tags() print('** cvs has %d changeset' % (nchangesets), file=sys.stderr) if nchangesets <= 0: @@ -173,12 +182,21 @@ def main(): if do_incremental and convert_all: commits = git_commit_map(args[1], git_branch, email_domain) + branch_tips = git_branch_tips( + args[1], existing_branches, changesets, email_domain) branch_sources = git_branch_sources( - args[1], git_branch, cvs, existing_refs, existing_branches, - commits, log_encodings) + args[1], git_branch, cvs, existing_branches, commits, + log_encodings) tag_sources = git_tag_sources( args[1], git_branch, cvs, existing_refs, existing_branches, - commits, log_encodings, last_ctime) + existing_ref_commits, commits, log_encodings, last_ctime, + branch_tips) + cvs.filter_external_source_adjustments( + args[1], branch_sources, tag_sources) + cvs.add_missing_git_ref_adjustments( + args[1], branch_sources, tag_sources) + if convert_all: + cvs.drop_ref_roots() if not dump_all: # don't use last 10 minutes for safety @@ -189,11 +207,20 @@ def main(): found_last_revision = False markseq = cvs.markseq extags = set() + reset_tags = set() commit_marks = dict() initialized_branches = set(existing_branches) + found_branches = set() + started_branches = set() + adjusted_branches = set() + branches_by_base = refs_by_base(cvs.branch_bases) + tags_by_base = refs_by_base(cvs.tags) for k in changesets: if do_incremental and is_cvs_branch(k.branch): - if k.branch in existing_branches: + if k.branch in branch_tips and k.branch not in found_branches: + if changeset_matches_git_tip( + k, branch_tips[k.branch], log_encodings): + found_branches.add(k.branch) continue elif do_incremental and not found_last_revision: if k.min_time == last_ctime and k.author == last_author: @@ -206,77 +233,122 @@ def main(): branch_source = None if do_incremental and is_cvs_branch(k.branch): - branch_source = cvs_branch_source( - k.branch, cvs, commit_marks, existing_refs, 
branch_sources) + if k.branch in branch_tips: + branch_source = branch_tips[k.branch]['commit'] + else: + branch_source = cvs_branch_source( + k.branch, cvs, commit_marks, branch_sources) if branch_source is None: continue - marks = {} - - for f in k.revs: - if not do_incremental: - marks[f.markseq] = f - else: - markseq = markseq + 1 - git_dump_file(f.path, f.rev, rcs, markseq) - marks[markseq] = f - log = rcsparse.rcsfile(k.revs[0].path).getlog(k.revs[0].rev) - log = decode_log(log, log_encodings).encode('utf-8', 'ignore') - if is_cvs_branch(k.branch): if branch_source is None: branch_source = cvs_branch_source( - k.branch, cvs, commit_marks, existing_refs, branch_sources) + k.branch, cvs, commit_marks, branch_sources) if branch_source is None: continue reset_branch(k.branch, branch_source, initialized_branches) + if k.branch in cvs.branch_adjustments and \ + k.branch not in adjusted_branches: + markseq = emit_partial_pick_commits( + 'refs/heads/%s' % k.branch, + cvs.branch_adjustments[k.branch], do_incremental, rcs, + markseq, log_encodings, email_domain) + adjusted_branches.add(k.branch) + + commit_revs = k.revs + marks = {} + for f in commit_revs: + markseq, mark = mark_file_revision( + f, do_incremental, rcs, markseq) + if mark is not None: + marks[f] = mark output('commit ' + git_ref(k.branch, git_branch)) markseq = markseq + 1 output('mark :%d' % (markseq)) commit_marks[k] = markseq - email = k.author if email_domain is None \ - else k.author + '@' + email_domain - output('author %s <%s> %d +0000' % (k.author, email, k.min_time)) - output('committer %s <%s> %d +0000' % (k.author, email, k.min_time)) - - output('data', len(log)) - output(log, end='') - if do_incremental and git_tip is not None and \ + commit_data = changeset_commit_data( + k, log_encodings, email_domain) + output(b'author ' + commit_data['author']) + output(b'committer ' + commit_data['committer']) + + output('data', len(commit_data['log'])) + output(commit_data['log'], end='') + if 
do_incremental and k.branch in branch_tips and \ + k.branch not in started_branches: + output('from', branch_tips[k.branch]['commit']) + started_branches.add(k.branch) + elif do_incremental and git_tip is not None and \ not is_cvs_branch(k.branch): output('from', git_tip) git_tip = None - for m in marks: - f = marks[m] - mode = 0o100755 if os.access(f.path, os.X_OK) else 0o100644 - fn = file_path(cvs.cvsroot, f.path) + for f in commit_revs: if f.state == 'dead': - output('D', fn) + output('D', f.fn) else: - output('M %o :%d %s' % (mode, m, fn)) + output('M %o :%d %s' % (f.mode(), marks[f], f.fn)) output('') for tag in k.tags: - if tag in extags: + if tag in extags or tag in reset_tags or \ + tag in cvs.collapsed_symbols or \ + tag in cvs.tag_adjustments: + continue + source = ':%d' % commit_marks[k] + reset_tag(tag, source) + for tag in tags_by_base.get(k, []): + if tag in extags or tag in reset_tags or \ + tag in cvs.collapsed_symbols or \ + tag not in cvs.tag_adjustments: continue - output('reset refs/tags/%s' % (tag)) - output('from :%d' % (markseq)) - output('') - for branch, base in list(cvs.branch_bases.items()): - if base is k: - source = cvs_branch_source( - branch, cvs, commit_marks, existing_refs, branch_sources) - if source is not None: - reset_branch(branch, source, initialized_branches) + source = ':%d' % commit_marks[k] + reset_tag(tag, source) + markseq = emit_partial_pick_commits( + 'refs/tags/%s' % tag, cvs.tag_adjustments[tag], + do_incremental, rcs, markseq, log_encodings, email_domain) + reset_tags.add(tag) + for branch in branches_by_base.get(k, []): + source = cvs_branch_source( + branch, cvs, commit_marks, branch_sources) + if source is not None: + reset_branch(branch, source, initialized_branches) + if branch in cvs.branch_adjustments and \ + branch not in adjusted_branches: + markseq = emit_partial_pick_commits( + 'refs/heads/%s' % branch, + cvs.branch_adjustments[branch], do_incremental, rcs, + markseq, log_encodings, email_domain) + 
adjusted_branches.add(branch) if do_incremental: for tag, source in list(tag_sources.items()): + if tag in cvs.collapsed_symbols: + continue reset_tag(tag, source) + if tag in cvs.tag_adjustments: + markseq = emit_partial_pick_commits( + 'refs/tags/%s' % tag, cvs.tag_adjustments[tag], + do_incremental, rcs, markseq, log_encodings, + email_domain) + reset_tags.add(tag) for branch, source in list(branch_sources.items()): reset_branch(branch, source, initialized_branches) + if branch in cvs.branch_adjustments and \ + branch not in adjusted_branches: + markseq = emit_partial_pick_commits( + 'refs/heads/%s' % branch, cvs.branch_adjustments[branch], + do_incremental, rcs, markseq, log_encodings, + email_domain) + adjusted_branches.add(branch) if do_incremental and not found_last_revision: raise Exception('could not find the last revision') + if do_incremental: + missing = set(branch_tips) - found_branches + if len(missing) > 0: + raise Exception('could not find the last revision for %s' % + ', '.join(sorted(missing))) print('** dumped', file=sys.stderr) @@ -300,14 +372,31 @@ def output(*args, end='\n'): class FileRevision: - def __init__(self, path, rev, state, markseq): + __slots__ = ( + 'path', 'fn', 'rev', 'state', 'markseq', 'git_mode', + 'deleted_later') + + def __init__(self, path, fn, rev, state, markseq, deleted_later=False): self.path = path + self.fn = fn self.rev = rev self.state = state self.markseq = markseq + self.git_mode = None + self.deleted_later = deleted_later + + def mode(self): + if self.git_mode is None: + self.git_mode = 0o100755 if os.access(self.path, os.X_OK) \ + else 0o100644 + return self.git_mode class ChangeSetKey: + __slots__ = ( + 'branch', 'author', 'min_time', 'max_time', 'commitid', 'fuzzsec', + 'revs', 'tags', 'log_hash') + def __init__(self, branch, author, timestamp, log, commitid, fuzzsec): self.branch = branch self.author = author @@ -378,8 +467,10 @@ def merge(self, anot): def __hash__(self): return hash(self.branch + '/' + 
self.author) * 31 + self.log_hash - def put_file(self, path, rev, state, markseq): - self.revs.append(FileRevision(path, rev, state, markseq)) + def put_file(self, path, fn, rev, state, markseq, deleted_later=False): + f = FileRevision(path, fn, rev, state, markseq, deleted_later) + self.revs.append(f) + return f def _cmp2(a, b): @@ -396,9 +487,19 @@ def __init__(self, cvsroot, rcs, dumpfile, fuzzsec, convert_all=False): self.dumpfile = dumpfile self.markseq = 0 self.tags = dict() + self.tag_roots = dict() self.branch_bases = dict() + self.branch_roots = dict() + self.missing_branch_roots = dict() + self.branch_adjustments = dict() + self.tag_adjustments = dict() + self.missing_tag_roots = dict() + self.collapsed_symbols = set() + self.branch_symbols = set() + self.tag_symbols = set() self.fuzzsec = fuzzsec self.convert_all = convert_all + self.normal_rcs_paths = set() def walk(self, module=None): p = [self.cvsroot] @@ -406,6 +507,7 @@ def walk(self, module=None): p.append(module) path = os.path.join(*p) + rcs_paths = [] for root, dirs, files in os.walk(path): if '.git' in dirs: print('Ignore %s: cannot handle the path named \'.git\'' % ( @@ -418,44 +520,76 @@ def walk(self, module=None): for f in files: if not f[-2:] == ',v': continue - self.parse_file(root + os.sep + f) + rcs_path = root + os.sep + f + rcs_paths.append(rcs_path) + if not is_attic_path(rcs_path): + self.normal_rcs_paths.add(file_path(self.cvsroot, rcs_path)) + + for path in rcs_paths: + self.parse_file(path) - for t, c in list(self.tags.items()): - c.tags.append(t) + self.collapsed_symbols = self.branch_symbols & self.tag_symbols def parse_file(self, path): rtags = dict() rbranches = dict() rcsfile = rcsparse.rcsfile(path) + fn = file_path(self.cvsroot, path) + shadowed_attic = is_attic_path(path) and fn in self.normal_rcs_paths branches = {'1': 'HEAD', '1.1.1': 'VENDOR'} - symbols = list(rcsfile.symbols.items()) + symbols = [] if shadowed_attic else list(rcsfile.symbols.items()) for k, v in 
symbols: + if self.convert_all and k != 'HEAD': + if is_cvs_branch_symbol(v): + self.branch_symbols.add(k) + else: + self.tag_symbols.add(k) r = v.split('.') if len(r) == 3: branches[v] = 'VENDOR' - elif len(r) >= 3 and r[-2] == '0': - branches['.'.join(r[:-2] + r[-1:])] = k + elif is_cvs_branch_symbol(v): + branch = '.'.join(r[:-2] + r[-1:]) + branches[branch] = k if self.convert_all: b = '.'.join(r[:-2]) if b not in rbranches: rbranches[b] = list() rbranches[b].append(k) - - for k, v in symbols: - r = v.split('.') - if len(r) >= 3 and r[-2] == '0': - continue - branch = branches.get('.'.join(r[:-1])) - if branch == 'HEAD' or (self.convert_all and branch is not None): - if v not in rtags: - rtags[v] = list() - rtags[v].append(k) + if self.convert_all: + for k, v in symbols: + if k == 'HEAD': + continue + r = v.split('.') + if is_cvs_branch_symbol(v): + continue + branch = branches.get('.'.join(r[:-1])) + if branch is not None: + if v not in rtags: + rtags[v] = [] + rtags[v].append(k) revs = rcsfile.revs.items() # sort by revision descending to priorize 1.1.1.1 than 1.1 revs = sorted(revs, key=lambda a: a[1][0], reverse=True) # sort by time revs = sorted(revs, key=lambda a: a[1][1]) + + dead_after_revs = set() + seen_dead = False + for k, v in reversed(revs): + if seen_dead: + dead_after_revs.add(k) + if v[3] == 'dead': + seen_dead = True + + def put_skipped_roots(rev, info): + frev = FileRevision( + path, fn, rev, info[3], 0, rev in dead_after_revs) + self.put_branch_roots(rbranches, rev, frev, None, info[1]) + if rev in rtags: + for tag in rtags[rev]: + self.put_tag_root(tag, frev, None, info[1]) + novendor = False have_initial_revision = False last_vendor_status = None @@ -464,20 +598,25 @@ def parse_file(self, path): if len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1' \ and r[3] == '1': if have_initial_revision: + put_skipped_roots(k, v) continue if v[3] == 'dead': + put_skipped_roots(k, v) continue last_vendor_status = v[3] have_initial_revision = 
True elif len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1': if novendor: + put_skipped_roots(k, v) continue last_vendor_status = v[3] elif len(r) == 2: if r[0] == '1' and r[1] == '1': if have_initial_revision: + put_skipped_roots(k, v) continue if v[3] == 'dead': + put_skipped_roots(k, v) continue have_initial_revision = True elif r[0] == '1' and r[1] != '1': @@ -489,6 +628,7 @@ def parse_file(self, path): else: b = '.'.join(r[:-1]) if not self.convert_all or b not in branches: + put_skipped_roots(k, v) continue last_vendor_status = None @@ -505,7 +645,8 @@ def parse_file(self, path): print('Aborted at %s %s' % (path, v[0]), file=sys.stderr) raise e - a.put_file(path, k, v[3], self.markseq) + frev = a.put_file( + path, fn, k, v[3], self.markseq, k in dead_after_revs) while a in self.changesets: c = self.changesets[a] del self.changesets[a] @@ -513,16 +654,303 @@ def parse_file(self, path): a = c self.changesets[a] = a if k in rtags: - for t in rtags[k]: - if t not in self.tags or \ - self.tags[t].max_time < a.max_time: - self.tags[t] = a - if k in rbranches: - for branch in rbranches[k]: - if branch not in self.branch_bases or \ - self.branch_bases[branch].max_time < a.max_time: - self.branch_bases[branch] = a + for tag in rtags[k]: + if tag not in self.tags or \ + self.tags[tag].max_time < a.max_time: + self.tags[tag] = a + self.put_tag_root(tag, frev, a, a.max_time) + self.put_branch_roots(rbranches, k, frev, a, a.max_time) + + def prepare_tags(self): + for tag, changeset in list(self.tags.items()): + if tag in self.collapsed_symbols: + continue + changeset.tags.append(tag) + + def put_branch_roots(self, rbranches, rev, frev, changeset, timestamp): + if rev not in rbranches: + return + + for branch in rbranches[rev]: + item = (changeset, frev, timestamp) + if branch not in self.branch_roots: + self.branch_roots[branch] = [] + self.branch_roots[branch].append(item) + if frev.state != 'dead' and \ + (is_attic_path(frev.path) or frev.deleted_later): + if 
branch not in self.missing_branch_roots: + self.missing_branch_roots[branch] = dict() + self.missing_branch_roots[branch][frev.fn] = item + + def put_tag_root(self, tag, frev, changeset, timestamp): + item = (changeset, frev, timestamp) + if tag not in self.tag_roots: + self.tag_roots[tag] = dict() + self.tag_roots[tag][frev.fn] = item + if frev.state != 'dead' and \ + (is_attic_path(frev.path) or frev.deleted_later): + if tag not in self.missing_tag_roots: + self.missing_tag_roots[tag] = dict() + self.missing_tag_roots[tag][frev.fn] = item + + def select_branch_bases(self, changesets): + first_branch_changesets = dict() + for changeset in changesets: + if is_cvs_branch(changeset.branch) and \ + changeset.branch not in first_branch_changesets: + first_branch_changesets[changeset.branch] = changeset + + roots_by_branch = dict() + candidates_by_branch = dict() + for branch, roots in list(self.branch_roots.items()): + root_revs = dict() + for item in roots: + root_revs[item[1].fn] = item + + first = first_branch_changesets.get(branch) + candidates = [ + item for item in roots + if item[0] is not None + ] + if first is not None: + before_first = [ + item for item in candidates + if item[2] < first.min_time + ] + if len(before_first) > 0: + candidates = before_first + if len(candidates) == 0: + continue + roots_by_branch[branch] = root_revs + candidates_by_branch[branch] = list(dict.fromkeys([ + item[0] for item in candidates + ])) + + self.score_branch_bases(changesets, roots_by_branch, + candidates_by_branch) + self.branch_adjustments = self.collect_ref_adjustments( + changesets, roots_by_branch, self.branch_bases) + self.tag_adjustments = self.collect_ref_adjustments( + changesets, self.tag_roots, self.tags) + self.branch_roots = dict() + + def score_branch_bases(self, changesets, roots_by_branch, + candidates_by_branch): + score_data = dict() + scores = dict() + + for branch, candidates in list(candidates_by_branch.items()): + roots = roots_by_branch[branch] + 
def collect_ref_adjustments(self, changesets, roots_by_ref, bases_by_ref):
    """Find root revisions that differ from the tree at each ref's base.

    ``bases_by_ref`` maps a ref name to its base changeset and
    ``roots_by_ref`` maps a ref name to ``{file name: root item}``, where
    a root item is a tuple whose second element is the file revision.
    Refs without an entry in ``roots_by_ref`` are ignored.

    ``changesets`` are replayed in the given order.  For each state
    branch (as computed by ``branch_state_name``) the latest non-dead
    revision of every watched file is tracked.  When a changeset that is
    the base of one or more refs has been applied, every root item of
    those refs that ``same_branch_root_state`` reports as different from
    the tracked state is collected.

    Returns a dict mapping ref name to the list of mismatching root
    items, in file-name order; refs without mismatches are absent.
    """
    trackers = {}
    for ref_name, base_changeset in tuple(bases_by_ref.items()):
        if ref_name not in roots_by_ref:
            continue
        line_name = branch_state_name(base_changeset.branch)
        tracker = trackers.setdefault(
            line_name, {'bases': {}, 'paths': {}, 'state': {}})
        tracker['bases'].setdefault(base_changeset, []).append(ref_name)
        for fn, root_item in tuple(roots_by_ref[ref_name].items()):
            tracker['paths'].setdefault(fn, []).append((ref_name, root_item))

    found = {}
    for changeset in changesets:
        tracker = trackers.get(branch_state_name(changeset.branch))
        if tracker is None:
            continue
        watched = tracker['paths']
        live = tracker['state']

        # Apply this changeset to the tracked tree before comparing, so a
        # base changeset's own revisions count as part of the base state.
        for frev in changeset.revs:
            if frev.fn in watched:
                if frev.state == 'dead':
                    live.pop(frev.fn, None)
                else:
                    live[frev.fn] = frev

        for ref_name in tracker['bases'].get(changeset, ()):
            for fn, root_item in sorted(roots_by_ref[ref_name].items()):
                if not same_branch_root_state(live.get(fn), root_item[1]):
                    found.setdefault(ref_name, []).append(root_item)

    return found
def adjustment_exists(self, adjustments, ref, item):
    """Tell whether ``item``'s file revision is already queued for ``ref``.

    ``adjustments`` maps a ref name to a list of
    ``(source, file revision, timestamp)`` triples.  Two revisions count
    as the same when their ``fn``, ``rev`` and ``state`` all compare
    equal; the source and timestamp are not compared.
    """
    if ref not in adjustments:
        return False
    wanted = item[1]

    def same_revision(known):
        return known.fn == wanted.fn and known.rev == wanted.rev and \
            known.state == wanted.state

    return any(same_revision(known)
               for _src, known, _when in adjustments[ref])
def same_branch_root_state(current, root):
    """Compare a tracked file state against a ref's root revision.

    ``current`` is the tracked revision of the file, or None when the
    file is absent.  A dead ``root`` matches only an absent file; a live
    ``root`` matches only a present revision with the same ``rev`` and
    ``state``.
    """
    absent = current is None
    if root.state == 'dead' or absent:
        return absent and root.state == 'dead'
    return current.rev == root.rev and current.state == root.state
def mark_file_revision(frev, do_incremental, rcs, markseq):
    """Return ``(markseq, mark)`` for a file revision's blob.

    A dead revision has no blob, so the mark is None and the counter is
    returned unchanged.  Outside incremental mode a revision with a
    non-zero ``frev.markseq`` reuses that mark.  Otherwise the counter is
    advanced by one, the revision is dumped through ``git_dump_file``
    under the new mark, and the new value is returned as both the counter
    and the mark.
    """
    if frev.state == 'dead':
        return markseq, None

    reusable = (not do_incremental) and frev.markseq != 0
    if reusable:
        return markseq, frev.markseq

    fresh_mark = markseq + 1
    git_dump_file(frev.path, frev.rev, rcs, fresh_mark)
    return fresh_mark, fresh_mark
def partial_pick_source_name(source, frev):
    """Describe where a partially picked commit came from.

    With no source changeset the file revision number is named.  With a
    source, its CVS commitid is preferred; when the commitid is None the
    changeset is identified by author and start time instead.
    """
    if source is None:
        return 'CVS revision %s' % frev.rev
    if source.commitid is None:
        return 'CVS changeset %s@%d' % (source.author, source.min_time)
    return 'CVS commitid %s' % source.commitid
def changeset_matches_git_tip(changeset, tip, log_encodings):
    """Tell whether ``changeset`` is the commit described by ``tip``.

    ``tip`` is a dict with at least ``'author'``, ``'time'`` and
    ``'log'`` entries.  The changeset's start time (truncated to int) and
    author are compared first; only when both agree is the key from
    ``git_key_from_changeset`` computed and compared against the tip's
    ``(author, time, log)`` triple.
    """
    same_time = int(changeset.min_time) == tip['time']
    if not (same_time and changeset.author == tip['author']):
        return False

    actual_key = git_key_from_changeset(changeset, log_encodings)
    return actual_key == (tip['author'], tip['time'], tip['log'])
def git_ref_commit(git_dir, ref_commits, ref):
    """Resolve ``ref`` to a commit id.

    Lookup order:

    1. ``ref_commits`` (a mapping of already known ref -> commit id);
    2. ``ref`` itself when it is exactly 40 lowercase hex digits;
    3. ``git rev-parse --verify <ref>^{commit}`` in ``git_dir``.

    Exits the process with git's return code when the git command fails.
    Raises Exception when git succeeds but prints nothing.
    """
    if ref in ref_commits:
        return ref_commits[ref]
    # fullmatch, not match(...$): '$' also matches just before a trailing
    # newline, which would hand back an unstripped "sha\n" as a commit id.
    if re.fullmatch('[0-9a-f]{40}', ref):
        return ref

    # subprocess.run waits for git and closes the stdout pipe for us.
    proc = subprocess.run(
        ['git', '--git-dir=' + git_dir, 'rev-parse', '--verify',
         ref + '^{commit}'],
        encoding='utf-8', stdout=subprocess.PIPE)
    if proc.returncode != 0:
        print("Couldn't exec git", file=sys.stderr)
        sys.exit(proc.returncode)

    lines = proc.stdout.splitlines()
    if not lines:
        raise Exception('could not resolve git ref %s' % ref)
    return lines[0].strip()
def rerecomple(self):
    """Rebuild the keyword regexes from the keys of ``self.rcs_expkw``.

    Each keyword name is regex-escaped and the names are joined into one
    alternation.  ``self.re_kw`` finds the first ``$Keyword`` followed by
    ``$`` or ``:`` anywhere in a line; ``self.re_kw_start`` matches the
    same construct only at the very start of the given bytes.
    """
    escaped_names = map(re.escape, self.rcs_expkw.keys())
    pat = b'|'.join(escaped_names)
    self.re_kw_start = re.compile(b"\\$(" + pat + b")[\\$:]")
    self.re_kw = re.compile(b".*?\\$(" + pat + b")[\\$:]")
ValueError: - break - prefix = line[:m.start(1) - 1] + def expand_keyword_line(self, rcs, filename, rev, line, mode, + has_next_line=False, expand_log=True): + logbuf = None + m = self.re_kw.match(line) + if m is None: + return line, logbuf + + line0 = b'' + while m is not None: + delim = m.end(0) - 1 + shared_dollar = False + try: + if line[delim:delim + 1] == b'$': + dsign = delim + else: + dsign = m.end(0) + line[m.end(0):].index(b'$') + except ValueError: + break + prefix = line[:m.start(1) - 1] + if line[delim:delim + 1] == b':' and \ + self.re_kw_start.match(line[dsign:]): + shared_dollar = True + line = line[dsign:] + else: line = line[dsign + 1:] - line0 += prefix - expbuf = '' - if (mode & self.RCS_KWEXP_NAME) != 0: - expbuf += '$' - expbuf += m.group(1).decode('ascii') - if (mode & self.RCS_KWEXP_VAL) != 0: - expbuf += ': ' + line0 += prefix + expbuf = '' + if (mode & self.RCS_KWEXP_NAME) != 0: + expbuf += '$' + expbuf += m.group(1).decode('ascii') if (mode & self.RCS_KWEXP_VAL) != 0: - expkw = self.rcs_expkw[m.group(1)] - if (expkw & self.RCS_KW_RCSFILE) != 0: - expbuf += filename \ - if (expkw & self.RCS_KW_FULLPATH) != 0 \ - else os.path.basename(filename) - expbuf += " " - if (expkw & self.RCS_KW_REVISION) != 0: - expbuf += rev[0] - expbuf += " " - if (expkw & self.RCS_KW_DATE) != 0: - expbuf += time.strftime( - "%Y/%m/%d %H:%M:%S ", time.gmtime(rev[1])) - if (expkw & self.RCS_KW_MDOCDATE) != 0: - d = time.gmtime(rev[1]) - expbuf += time.strftime( - "%B%e %Y " if (d.tm_mday < 10) else "%B %e %Y ", d) - if (expkw & self.RCS_KW_AUTHOR) != 0: - expbuf += rev[2] - expbuf += " " - if (expkw & self.RCS_KW_STATE) != 0: - expbuf += rev[3] - expbuf += " " - if (expkw & self.RCS_KW_LOG) != 0: - p = prefix - expbuf += filename \ - if (expkw & self.RCS_KW_FULLPATH) != 0 \ - else os.path.basename(filename) - expbuf += " " + expbuf += ': ' + if (mode & self.RCS_KWEXP_VAL) != 0: + expkw = self.rcs_expkw[m.group(1)] + if (expkw & self.RCS_KW_RCSFILE) != 0: + expbuf 
+= filename \ + if (expkw & self.RCS_KW_FULLPATH) != 0 \ + else os.path.basename(filename) + expbuf += " " + if (expkw & self.RCS_KW_REVISION) != 0: + expbuf += rev[0] + expbuf += " " + if (expkw & self.RCS_KW_DATE) != 0: + expbuf += time.strftime( + "%Y/%m/%d %H:%M:%S ", time.gmtime(rev[1])) + if (expkw & self.RCS_KW_MDOCDATE) != 0: + d = time.gmtime(rev[1]) + expbuf += time.strftime( + "%B%e %Y " if (d.tm_mday < 10) else "%B %e %Y ", d) + if (expkw & self.RCS_KW_AUTHOR) != 0: + expbuf += rev[2] + expbuf += " " + if (expkw & self.RCS_KW_STATE) != 0: + expbuf += rev[3] + expbuf += " " + if (expkw & self.RCS_KW_LOG) != 0: + p = prefix + expbuf += filename \ + if (expkw & self.RCS_KW_FULLPATH) != 0 \ + else os.path.basename(filename) + expbuf += " " + if expand_log: + log = rcs.getlog(rev[0]) logbuf = p + ( 'Revision %s %s %s\n' % ( rev[0], time.strftime( "%Y/%m/%d %H:%M:%S", time.gmtime(rev[1])), rev[2])).encode('ascii') - for lline in rcs.getlog(rev[0]).rstrip().split(b'\n'): + for lline in log.rstrip().split(b'\n'): if len(lline) == 0: logbuf += p.rstrip() + b'\n' else: - logbuf += p + lline.lstrip() + b'\n' + logbuf += p + lline + b'\n' if len(line) == 0: logbuf += p.rstrip() + if has_next_line and log.endswith(b'\n\n'): + logbuf += b'\n' + p.rstrip() else: - logbuf += p + line.lstrip() + tail, _tail_logbuf = self.expand_keyword_line( + rcs, filename, rev, line.lstrip(), mode, + False, False) + logbuf += p + tail line = b'' - if (expkw & self.RCS_KW_SOURCE) != 0: - expbuf += filename - expbuf += " " - if (expkw & (self.RCS_KW_NAME | self.RCS_KW_LOCKER)) != 0: - expbuf += " " - if (mode & self.RCS_KWEXP_NAME) != 0: - expbuf += '$' - line0 += expbuf[:255].encode('ascii') - m = self.re_kw.match(line) - - ret += [line0 + line] - if logbuf is not None: - ret += [logbuf] - return b'\n'.join(ret) + if (expkw & self.RCS_KW_SOURCE) != 0: + expbuf += filename + expbuf += " " + if (expkw & (self.RCS_KW_NAME | self.RCS_KW_LOCKER)) != 0: + expbuf += " " + if (mode & 
class CommandError(Exception):
    """Raised when an external command finishes with a failing status.

    Keeps the command list as ``cmd`` and the completed process object as
    ``proc``; the exception text is whatever ``message()`` returns.
    """

    def __init__(self, cmd, proc):
        self.cmd = cmd
        self.proc = proc
        super().__init__(self.message())

    def message(self):
        """Build the report: status line, then stderr, then stdout.

        Both streams are decoded as UTF-8 with replacement and stripped;
        empty ones are omitted.  stdout may be None and is cut to its
        first 2000 characters.
        """
        err_text = self.proc.stderr.decode('utf-8', 'replace').strip()
        out_text = (self.proc.stdout or b'').decode(
            'utf-8', 'replace').strip()
        sections = ['command failed (%d): %s' % (
            self.proc.returncode, ' '.join(self.cmd))]
        if err_text:
            sections.append(err_text)
        if out_text:
            sections.append(out_text[:2000])
        return '\n'.join(sections)
def tree_files(path):
    """List the files below ``path`` as sorted POSIX-style relative paths.

    Subdirectories named ``CVS`` are not descended into, and files whose
    name starts with ``.#`` are left out.
    """
    collected = []
    for current_dir, subdirs, names in os.walk(path):
        # Assigning to the slice prunes the walk in place, so os.walk
        # never enters the CVS directories.
        subdirs[:] = [entry for entry in subdirs if entry != 'CVS']
        here = Path(current_dir)
        collected.extend(
            (here / name).relative_to(path).as_posix()
            for name in names
            if not name.startswith('.#'))
    collected.sort()
    return collected
def file_bytes(path):
    """Return comparable bytes for the file at ``path``.

    A regular file yields its raw contents.  A symbolic link is not
    followed; it yields ``b'SYMLINK\\0'`` followed by the link target
    encoded as UTF-8 with surrogateescape.
    """
    if not os.path.islink(path):
        with open(path, 'rb') as handle:
            return handle.read()

    target = os.readlink(path)
    return b'SYMLINK\0' + target.encode('utf-8', 'surrogateescape')
def main():
    """Compare HEAD and every selected Git ref against CVS checkouts.

    Refs come from ``list_refs`` using the ``--symbol-glob`` argument;
    the last path component of each ref is used as the CVS symbol.  The
    HEAD comparison runs first unless ``--no-head`` was given.  Work is
    done in a fresh temporary directory, which is removed afterwards
    unless ``--keep-work`` was given (then its path is printed).

    Returns 1 when any comparison failed, otherwise 0.
    """
    args = parse_args()
    refs = list_refs(args.git_dir, args.symbol_glob)

    scratch = Path(tempfile.mkdtemp(prefix='verify-cvs-git-'))
    cvs_tree = scratch / 'cvs'
    git_tree = scratch / 'git'

    try:
        targets = [] if args.no_head else [(HEAD_REF, None)]
        targets.extend((ref, ref.rsplit('/', 1)[-1]) for ref in refs)

        failed = 0
        for ref, symbol in targets:
            if not compare_ref(args, ref, symbol, cvs_tree, git_tree):
                failed += 1
        return 1 if failed else 0
    finally:
        if args.keep_work:
            print('kept work dir %s' % scratch)
        else:
            shutil.rmtree(scratch, ignore_errors=True)