#!/usr/bin/env python
#
# Notes:
# + dependencies: cvs2cl, pysvn, and textproc/py-xml
#
# + keywords must be disabled in order to guarantee that repocopies will be
#   resolved correctly
#
# + repository must be setup to allow setting properties (need to be able to
#   set correct patch author and date --- see docs for details)
#
# + repository must have svn:lastupdate set on trunk to allow tracking updates
# + I converted ncvs/src with the following command:
#   cvs2svn -s svn/src --fallback-encoding=latin_1 --cvs-revnums --keywords-off --exclude="old*" --force-branch=phk --exclude=ipfw --exclude=NOORDMARK --exclude=PROTOTYPE --exclude=ipfw_tag --force-branch=WPAUL --force-branch=ache --force-branch=chat --force-branch=xten --force-branch=games --force-branch=gnu --force-branch="libg\+\+" --force-branch=libpcap --force-branch=RELENG_5 --force-branch=RELENG_2_1_0 --force-branch=RELEASE_2_0 --force-branch=BETA_2_0 --force-branch=ALPHA_2_0 --force-branch=V_0_1_2_4 --force-branch=FBSD_1 --force-branch=old_WOLLMAN_MBUF --force-branch=old_RELENG_2_2 --force-branch=old_wollman_polling --force-branch=old_RELENG_2_1_0 --force-branch=old_RELENG_2_0_5 --force-branch=old_OLAH_TTCP --force-branch=old_ALPHA_2_0 --force-branch=old_BETA_1_1 --force-branch=old_BRANCH_1_0 --force-branch=old_V_0_1_2_4 ncvs/src >& cvs2svn.conversion.log.0
#
#
# cvs2svn -s svn/src --fallback-encoding=latin_1 --cvs-revnums --keywords-off 17262.96s user 7634.67s system 48% cpu 14:13:33.58 total
#
# Steps:
# 1) create repo with cvs2svn
# 2) allow setting of revprop
# 3) look at svn log's first entry - set the svn:lastupdate to that on trunk
#    - in a checkout of trunk do:
#      1) svn propset svn:lastupdate 2008-03-08T19:33:34.000000Z "."
#      2) svn commit "." -m "update svn:lastupdate for cvs2svnupdate"
# svn propset svn:author --revprop -r 177905 marius file:///bucket/public/devel/svn/src
#
# + works fastest if repos are in memory -- mount -o size=foo dummy /tmp
#
# + must have disk space for a copy of each branch that will be patched
#
# + usage: see usage()
#
# + can take a *long* time on the first execution to checkout a working copy
#   of all the branches that need updating - following that - executing cvs2cl
#   is the bottleneck when there aren't many months worth of changesets
#
# + for handling years of changesets the xml manipulation would need to be
#   changed to work out of core - cvs2svnupdate is intended as a follow on to
#   cvs2svn, i.e. it's intended to keep a converted repository up to date --
#   not to actually convert
#
# BUGS:
# - cvs2svnupdate doesn't currently handle new tags
# - cvs2svnupdate has support for correcting for repo copies - but it is
#   untested (in other words it probably doesn't work)
#
# The statements below are written so that they parse under both Python 2 and
# Python 3 (single-argument print(...), sys.exc_info()), but the script still
# needs a Python 2 runtime: it imports the md5 module and PyXML's xml.dom.ext.

import calendar
import datetime
import getopt
import md5
import os
import shelve
import string
import sys
import time

from xml.dom.ext.reader.Sax2 import FromXmlFile
import xml.sax._exceptions

import pysvn

lastdatestr = ""


def _invocationDir():
    """Return the directory every function chdir()s back to when done.

    $PWD is not changed by os.chdir(), so it keeps naming the directory the
    script was started from; fall back to the current directory when the
    variable is not set.
    """
    return os.environ.get('PWD') or os.getcwd()


def _scratchDir():
    """Return $TMPDIR, or "/tmp" when it is not set."""
    return os.environ.get("TMPDIR", "/tmp")


def _isoStamp(timeTuple):
    """Format the first six fields of a time tuple as YYYY-MM-DDTHH:MM:SS.000000Z."""
    return datetime.datetime(*timeTuple[:6]).isoformat() + ".000000Z"


def _uniqueKey(mapping, base):
    """Return *base*, or *base* plus a "#n" suffix, such that it is not in *mapping*."""
    key = base
    serial = 0
    while key in mapping:
        serial += 1
        key = "%s#%d" % (base, serial)
    return key


def _lap(since):
    """Print the whole seconds elapsed since *since* and return the current time."""
    now = int(time.time())
    print(str(now - since) + " seconds elapsed")
    return now


class PatchObject:
    """One changelog entry: the files it touches plus author, date and message."""

    def __init__(self):
        self.deadFiles = []    # names of files whose cvsstate is "dead"
        self.deadRevs = {}     # dead file name -> CVS revision that removed it
        self.oldFiles = []     # (name, revision) pairs found in the svn repo
        self.newFiles = []     # names not found in the svn repo
        self.ambigFiles = []   # (name, revision) pairs not yet classified
        self.author = ""
        self.branches = []
        self.costrings = []    # shell commands that check the revisions out
        self.date = 0
        # Initialised here so checkin() has a message even when the entry
        # carries no <msg> element.
        self.msg = ""


class WorkingBranch:
    """A working copy of one branch together with the repository URL it tracks."""

    def __init__(self, path, repo, client):
        self.path = path
        self.repo = repo + "/"
        self.client = client
        self.digests = {}            # str(checksum) -> file path
        self.hashUpToDate = False
        self.revision = None

    def update(self):
        """Run an svn update of the whole working copy."""
        self.client.update(self.path)

    def trimDeadFiles(self, patch):
        """Schedule every dead file of *patch* for removal from the working copy."""
        for deadFile in patch.deadFiles:
            self.client.remove(self.path + deadFile)

    def updateDigestHash(self):
        """Populate self.digests from the working copy's checksums (once only)."""
        if self.hashUpToDate:
            return
        print("update digest hash ... wait")
        for entryPath, svninfo in self.client.info2(self.path):
            wc_info = svninfo["wc_info"]
            if wc_info is None:
                continue
            # The first two path components are dropped.
            # NOTE(review): presumably these are the work directory and the
            # branch name, which only holds when --workdir is a single
            # relative component -- confirm.
            relative = "/".join(entryPath.split("/")[2:])
            # NOTE(review): elsewhere this table is keyed by the raw
            # md5 .digest(); if svn reports a hex checksum here the two key
            # styles can never match -- confirm before relying on it.
            self.digests[str(wc_info["checksum"])] = relative
        self.hashUpToDate = True

    def prefixFile(self, fileName):
        """Return *fileName* prefixed with the working copy path."""
        return self.path + fileName

    def _recordDigest(self, filename):
        """Remember the MD5 of *filename*; unreadable files are skipped."""
        try:
            fo = open(filename, "rb")
            try:
                filestr = fo.read()
            finally:
                fo.close()
        except (IOError, OSError):
            return
        self.digests[str(md5.new(filestr).digest())] = filename

    def getRepoCopiedFiles(self, patch):
        """Return (name, revision) pairs of patch files missing from the repo.

        Unclassified files that can be listed in the repository are appended
        to patch.oldFiles; those that cannot are returned as candidates for a
        repo copy, as are dead files that cannot be listed.
        """
        movedFiles = []
        for srcfile, rev in patch.ambigFiles:
            try:
                self.client.list(self.repo + srcfile)
            except Exception:
                movedFiles.append((srcfile, rev))
            else:
                if (srcfile, rev) not in patch.oldFiles:
                    patch.oldFiles.append((srcfile, rev))
        for srcfile in patch.deadFiles:
            try:
                self.client.list(self.repo + srcfile)
            except Exception:
                # Pair the dead file with its own revision rather than with
                # whatever the previous loop happened to leave behind.
                movedFiles.append((srcfile, patch.deadRevs.get(srcfile)))
        return movedFiles

    def moveRepoCopiedFiles(self, patch, copiedFiles):
        """Try to turn repo-copied files into svn moves and commit them.

        For each (name, revision) in *copiedFiles* the previous CVS revision
        is fetched and its MD5 looked up in self.digests; on a match the
        matching working-copy file is moved to the new name.  If anything was
        moved, the moves are committed and the new revision's svn:date is set
        from patch.date.
        """
        self.updateDigestHash()
        movedFiles = []
        print(copiedFiles)
        for srcFile, rev in copiedFiles:
            try:
                prevFileStr = getPrevFile(srcFile, rev)
            except Exception:
                prevFileStr = None
            if prevFileStr == "":
                continue
            oldFile = None
            if prevFileStr is not None:
                csum = md5.new(prevFileStr).digest()
                oldFile = self.digests.get(str(csum))
            if oldFile is None:
                print("I can't find a file with a matching MD5 for  %s" % (srcFile,))
                print("may simply lose history")
                continue
            newFile = self.path + srcFile
            self.client.move(self.path + oldFile, newFile)
            movedFiles.append(newFile)
            self.digests[str(csum)] = srcFile
        if not movedFiles:
            return
        revision = self.client.checkin(movedFiles, "automatic repo copy adjustment")
        if revision is None:
            # No revision was created, so there is no date to adjust.
            return
        # set date to correspond to when the change occurred so that the
        # latest date on the repo doesn't get confused
        # NOTE(review): this formats local time with a "Z" suffix while
        # updateRevProperties() uses gmtime for svn:date -- confirm which one
        # is intended given how generateDate() builds patch.date.
        datestr = _isoStamp(time.localtime(float(patch.date)))
        self.client.revpropset("svn:date", datestr, self.repo, revision)
        self.client.update(movedFiles)

    def applyPatch(self, patch):
        """Check the patch's revisions out of CVS and copy them over the working copy.

        The checkout commands run in the scratch directory; the resulting
        src/ tree is stripped of CVS directories, copied into the working
        copy and deleted.  Files in patch.oldFiles get their cvs2svn:cvs-rev
        property set, and digests are recorded for old and new files.
        """
        cwd = _invocationDir()
        tmpdir = _scratchDir()
        srcTree = os.path.join(tmpdir, "src")
        print(patch.costrings)
        try:
            os.chdir(tmpdir)
            for costr in patch.costrings:
                os.system(costr)
            os.chdir(cwd)
            os.chdir(self.path)
            oldFiles = [filename for filename, rev in patch.oldFiles]
            print("updating " + str(oldFiles))
            self.client.update(oldFiles + patch.deadFiles)
            os.system("find " + srcTree + "/ -name CVS | xargs rm -rf")
            os.system("cp -r " + srcTree + "/* .")
            os.system("rm -rf " + srcTree)
            for filename, rev in patch.oldFiles:
                self.client.propset("cvs2svn:cvs-rev", rev, filename)
                self._recordDigest(filename)
            # pull in checksums for newly added files
            for filename in patch.newFiles:
                self._recordDigest(filename)
        finally:
            os.chdir(cwd)

    def addNewDirectories(self, patch):
        """Classify the patch's files and svn-add whatever is new.

        Unclassified files that can be listed in the repository go to
        patch.oldFiles, the rest to patch.newFiles.  For every new file the
        shallowest path component missing from the repository is added.

        Returns the list of paths (relative to the branch) that were added.
        """
        addDirs = []
        for ambigFile, rev in patch.ambigFiles:
            try:
                self.client.list(self.repo + ambigFile)
            except Exception:
                if ambigFile not in patch.newFiles:
                    patch.newFiles.append(ambigFile)
            else:
                if (ambigFile, rev) not in patch.oldFiles:
                    patch.oldFiles.append((ambigFile, rev))
        for newFile in patch.newFiles:
            filebits = newFile.split("/")
            for depth in range(1, len(filebits) + 1):
                filepath = "/".join(filebits[0:depth])
                try:
                    self.client.list(self.repo + filepath)
                    continue
                except Exception:
                    pass
                # First component missing from the repository.  'svn add' is
                # recursive, so if it was already added for an earlier file
                # there is nothing left to do for this one.
                if filepath not in addDirs:
                    try:
                        self.client.add(self.path + filepath)
                        addDirs.append(filepath)
                    except Exception:
                        print("failed add " + self.path + filepath)
                break
        # although 'svn add' is recursive - one has to checkin the same
        # set of paths that one added, so here we keep track for checkin
        return addDirs

    def checkin(self, patch, addDirs):
        """Commit dead files, added paths and old files with the patch's message.

        Returns whatever the client's checkin() returns.
        """
        unprefixedFiles = patch.deadFiles + addDirs
        unprefixedFiles.extend([srcFile for srcFile, rev in patch.oldFiles])
        commitFiles = [self.prefixFile(name) for name in unprefixedFiles]
        print("committing  %s" % (commitFiles,))
        return self.client.checkin(commitFiles, patch.msg)

    def updateRevProperties(self, patch, revision):
        """Set svn:author and svn:date on *revision* from *patch*.

        Returns patch.date formatted from local time, which the caller stores
        as svn:lastupdate.
        """
        self.client.revpropset("svn:author", patch.author, self.repo, revision)
        when = float(patch.date)
        # set it with UTC
        # NOTE(review): generateDate() builds patch.date with time.mktime(),
        # so gmtime() here may be offset from the changelog's timestamp by
        # the local UTC offset -- confirm.
        self.client.revpropset("svn:date", _isoStamp(time.gmtime(when)),
                               self.repo, revision)
        # return localtime
        return _isoStamp(time.localtime(when))


def updateDone(workdir, datestr, client):
    """Record *datestr* as svn:lastupdate on the HEAD working copy and commit it."""
    cwd = _invocationDir()
    os.chdir(workdir + "/HEAD")
    try:
        client.update(".", False)
        print("setting date to " + datestr)
        client.propset("svn:lastupdate", datestr, ".")
        client.checkin(".", "update svn:lastupdate for cvs2svnupdate")
    finally:
        os.chdir(cwd)


def generateDate(nodeList):
    """Return the first node's "%Y-%m-%dT%H:%M:%SZ" text as float seconds.

    Returns None for an empty node list.
    """
    for subnode in nodeList:
        datetuple = time.strptime(subnode.data, "%Y-%m-%dT%H:%M:%SZ")
        # NOTE(review): mktime() interprets the tuple as local time although
        # the text carries a "Z" suffix; the rest of the script formats the
        # value back with localtime() -- confirm before changing either side.
        return float(time.mktime(datetuple))


def generateAuthor(nodeList):
    """Return the text of the first node, or None for an empty list."""
    for subnode in nodeList:
        return subnode.data


def generateMsg(nodeList):
    """Return the first node's text encoded with the "iso8859" codec, or None."""
    for subnode in nodeList:
        return subnode.data.encode("iso8859")


def getText(nodeList):
    """Return the text of the first node, or None for an empty list."""
    for subnode in nodeList:
        return subnode.data


def generateDiffString(nodeList, serialObject):
    """Record one <file> element on *serialObject* and return its checkout arguments.

    Branch names are appended to serialObject.branches.  A file whose
    cvsstate is "dead" is appended to deadFiles (and its revision stored in
    deadRevs) and "" is returned; any other file is appended to ambigFiles
    and the argument string for "cvs co -kk" is returned.

    Raises ValueError when the element has no name or no revision.
    """
    name = None
    revision = None
    state = None
    for subnode in nodeList:
        if subnode.nodeType != subnode.ELEMENT_NODE:
            continue
        tag = subnode.tagName
        if tag == "name":
            name = getText(subnode.childNodes)
        elif tag == "revision":
            revision = getText(subnode.childNodes)
        elif tag == "cvsstate":
            state = getText(subnode.childNodes)
        elif tag == "branch":
            serialObject.branches.append(getText(subnode.childNodes))
    if name is None or revision is None:
        raise ValueError("changelog <file> element without name or revision")
    if state == "dead":
        serialObject.deadFiles.append(name)
        serialObject.deadRevs[name] = revision
        return ""
    serialObject.ambigFiles.append((name, revision))
    return "-r" + revision + " src/" + name + " > /dev/null 2>&1"


def generatePatch(entryNodeList):
    """Build a PatchObject from the child nodes of one changelog <entry>."""
    cs = "cvs co -kk "
    costrings = []
    patchObject = PatchObject()
    for subnode in entryNodeList:
        if subnode.nodeType != subnode.ELEMENT_NODE:
            continue
        tag = subnode.tagName
        if tag == "isoDate":
            patchObject.date = generateDate(subnode.childNodes)
        elif tag == "author":
            patchObject.author = generateAuthor(subnode.childNodes)
        elif tag == "msg":
            # An empty <msg/> yields None; keep the message a string.
            patchObject.msg = generateMsg(subnode.childNodes) or ""
        elif tag == "file":
            coargs = generateDiffString(subnode.childNodes, patchObject)
            # Dead files yield no arguments; a bare "cvs co -kk" would only
            # make cvs complain, so nothing is queued for them.
            if coargs:
                costrings.append(cs + coargs)
    patchObject.costrings = costrings
    return patchObject


def ChangesetToCommits(nodeList, patchlog):
    """Walk the DOM and store one PatchObject per <entry> element in *patchlog*.

    Entries are keyed by str(patch.date); a "#n" suffix is added when that
    key is already taken so that entries sharing a timestamp do not overwrite
    each other.
    """
    for subnode in nodeList:
        if subnode.nodeType != subnode.ELEMENT_NODE:
            continue
        if subnode.tagName == "entry":
            patch = generatePatch(subnode.childNodes)
            patchlog[_uniqueKey(patchlog, str(patch.date))] = patch
        else:
            ChangesetToCommits(subnode.childNodes, patchlog)


def setupWorkingBranches(patchlog, repo, workingDirectory, client, skip):
    """Return {branch name: WorkingBranch} for every branch used in *patchlog*.

    Patches without a branch count as "HEAD", which maps to <repo>/trunk/;
    other branches map to <repo>/branches/<name>.  A branch with no directory
    under *workingDirectory* is checked out; an existing one is updated
    unless *skip* is true.
    """
    branchdirs = os.listdir(workingDirectory)
    branchesUsed = set()
    for patch in patchlog.values():
        if patch.branches:
            branchesUsed.update(patch.branches)
        else:
            branchesUsed.add("HEAD")
    workingBranches = {}
    for branch in branchesUsed:
        workingPath = workingDirectory + "/" + branch + "/"
        if branch == "HEAD":
            workingRepo = repo + "/trunk/"
        else:
            workingRepo = repo + "/branches/" + branch
        workingBranches[branch] = WorkingBranch(workingPath, workingRepo, client)
        if branch not in branchdirs:
            # Nothing to update yet: create the working copy first.
            print("the " + branch + " was not found in " + workingDirectory + "...")
            print("checking out %s" % (branch,))
            client.checkout(workingRepo, workingPath)
        elif skip:
            print("assuming branches are up to date")
        else:
            # ensure that we're in a known good state
            print("ensure consistent state --- updating in " + branch)
            client.update(workingPath)
            # expensive
            #client.revert(workingPath, True)
    return workingBranches


def applyPatchlog(patchlog, workingBranches, client):
    """Apply every patch in date order and commit it to each of its branches.

    *client* is accepted for interface compatibility; each WorkingBranch uses
    the client it was constructed with.

    Returns the date string of the last patch that was committed on all of
    its branches, or "" if none was.  Processing stops at the first checkin
    that yields no revision.
    """
    lastCompleted = ""
    for patch in sorted(patchlog.values(), key=lambda entry: entry.date):
        print("date =  %s" % (time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime(float(patch.date))),))
        if patch.branches:
            branches = list(set(patch.branches))
        else:
            branches = ["HEAD"]
        datestr = ""
        for branch in branches:
            workingBranch = workingBranches[branch]
            # cope with any repo copies ... grrr
            print("workingBranchRepo =  %s" % (workingBranch.repo,))
            copiedFiles = workingBranch.getRepoCopiedFiles(patch)
            if copiedFiles:
                workingBranch.moveRepoCopiedFiles(patch, copiedFiles)
            # remove deleted files, patch, add new files, then commit
            if patch.deadFiles:
                workingBranch.trimDeadFiles(patch)
            workingBranch.applyPatch(patch)
            addDirs = workingBranch.addNewDirectories(patch)
            # a "new files but no addDirs" sanity check was tried here, but
            # it can be confused by a file being deleted and re-added
            revision = workingBranch.checkin(patch, addDirs)
            if revision is None:
                print("checkin failed")
                # Keep the progress made so far so the caller can still
                # record the last changeset that went in completely.
                return lastCompleted
            # update date and author
            datestr = workingBranch.updateRevProperties(patch, revision)
        lastCompleted = datestr
    return lastCompleted


def getPrevFile(name, revision):
    """Return the contents of src/<name> at the revision preceding *revision*.

    The last component of *revision* is decremented and that revision is
    checked out under the scratch directory.  Returns "" when the checkout
    did not produce a readable file.  The checked-out file is removed again
    and the working directory is always restored.
    """
    cwd = _invocationDir()
    os.chdir(_scratchDir())
    try:
        revList = revision.split(".")
        revList[-1] = str(int(revList[-1]) - 1)
        revPrev = ".".join(revList)
        os.system("cvs co -kk -r" + revPrev + " src/" + name)
        try:
            fo = open("src/" + name, "rb")
        except IOError:
            return ""
        try:
            filestr = fo.read()
        finally:
            fo.close()
        os.unlink("src/" + name)
        return filestr
    finally:
        os.chdir(cwd)


def usage():
    """Print the command line synopsis."""
    print("cvs2svnupdate --repo --workdir --cvsdir ")
    print("[--skip-new-changelog --skip-cvs-update --skip-svn-update]")


if __name__ == '__main__':
    flags = ["repo=", 'workdir=', 'cvsdir=', 'skip-new-changelog',
             'skip-cvs-update', 'skip-svn-update', 'early-exit']
    try:
        opts, args = getopt.getopt(sys.argv[1:], "", flags)
    except getopt.GetoptError:
        # print help information and exit:
        print(str(sys.exc_info()[1]))
        usage()
        sys.exit(2)

    repo = None
    workingDirectory = None
    cvsDirectory = None
    nochangelog = False
    nocvsupdate = False
    nosvnupdate = False
    earlyexit = False
    for o, a in opts:
        if o == "--repo":
            repo = a
        elif o == "--workdir":
            workingDirectory = a
        elif o == "--cvsdir":
            cvsDirectory = a
        elif o == "--skip-new-changelog":
            nochangelog = True
        elif o == "--skip-cvs-update":
            nocvsupdate = True
        elif o == "--skip-svn-update":
            nosvnupdate = True
        elif o == "--early-exit":
            earlyexit = True

    cvsroot = os.getenv("CVSROOT")
    if repo is None or workingDirectory is None or cvsDirectory is None:
        print("missing argument")
        usage()
        # Nothing below can work without all three arguments.
        sys.exit(2)
    if cvsroot is None:
        print("CVSROOT not set")
        sys.exit(1)
    try:
        os.stat(workingDirectory)
        os.stat(cvsDirectory)
    except OSError:
        print("invalid argument  %s" % (sys.exc_info()[1],))
        sys.exit(1)

    tmpdir = _scratchDir()
    # Joined rather than concatenated so that a TMPDIR without a trailing
    # slash still yields paths inside that directory.
    pendingPath = os.path.join(tmpdir, "pendingCommits")
    scratchSrc = os.path.join(tmpdir, "src")

    # get date of last revision
    client = pysvn.Client()
    lastUpdates = client.propget("svn:lastupdate", repo + "/trunk").values()
    if not lastUpdates:
        print("failed to get svn:lastupdate from " + repo + "/trunk")
        sys.exit(1)
    datestr = lastUpdates[0]
    datetuple = time.strptime(datestr, "%Y-%m-%dT%H:%M:%S.000000Z")
    headtime = time.mktime(datetuple)
    # A fallback that was tried and left disabled: take the date of the
    # newest log entry instead of the property --
    #     loghead = client.log(repo,
    #         revision_start=pysvn.Revision( pysvn.opt_revision_kind.head ),
    #         revision_end=pysvn.Revision( pysvn.opt_revision_kind.number, 0 ),
    #         discover_changed_paths=True, strict_node_history=True, limit=1)
    #     headtime = loghead[0].date
    headtime += 1

    print("starting at  %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),))
    localtime = float(headtime)
    print("generating changesets starting at  %s"
          % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(localtime)),))
    # generate changelog starting 1 second afterwards
    clstr = time.strftime("-d>%Y-%m-%d %H:%M:%S", time.localtime(localtime))

    cwd = _invocationDir()
    os.chdir(cvsDirectory)
    sec0 = int(time.time())
    if not nocvsupdate:
        print("[Phase 1: Updating cvs ...]")
        os.system("cvs -q update -Pd > /dev/null 2>&1")
        sec0 = _lap(sec0)
    else:
        print("[Phase 1: Skipping cvs update ...]")

    if nochangelog:
        print("[Phase 2: skipping Changelog generation ... using previous]")
    else:
        os.system("rm ChangeLog > /dev/null 2>&1")
        print("[Phase 2: generating ChangeLog]")
        sysstr = 'cvs2cl --xml --xml-encoding ISO-8859-1 -l "' + clstr + '"'
        print(sysstr)
        # disable for faster testing
        os.system(sysstr)
    sec0 = _lap(sec0)

    print("[Phase 3: Parsing Changelog]")
    # parse into in-core representation
    try:
        doc = FromXmlFile("ChangeLog", validate=0)
    except xml.sax._exceptions.SAXParseException:
        print("parse error  %s" % (sys.exc_info()[1],))
        sys.exit(1)
    except Exception:
        # Without a document there is nothing to convert.
        print("unable to read ChangeLog: %s" % (sys.exc_info()[1],))
        sys.exit(1)
    # drop whatever a previous run left behind
    try:
        os.unlink(pendingPath)
    except OSError:
        pass
    sec0 = _lap(sec0)

    patchlog = shelve.open(pendingPath, 'n')
    print("[Phase 4: Converting ChangeLog to commits]")
    # convert in to usable serialized representation
    ChangesetToCommits(doc.childNodes, patchlog)
    patchlog.sync()
    os.chdir(cwd)
    sec0 = _lap(sec0)

    print("[Phase 5: Setting up branches in working directory]")
    # determine which branches are in use and create checkouts if not present
    workingBranches = setupWorkingBranches(patchlog, repo, workingDirectory,
                                           client, nosvnupdate)
    sec0 = _lap(sec0)
    if earlyexit:
        sys.exit(0)

    print("[Phase 6: Committing patches]")
    # commit patches
    datestr = applyPatchlog(patchlog, workingBranches, client)
    if datestr != "":
        updateDone(workingDirectory, datestr, client)
    sec0 = _lap(sec0)

    print("Phase 7: cleaning up")
    # cleanup left over files
    patchlog.close()
    try:
        os.unlink(pendingPath)
    except OSError:
        # NOTE(review): depending on the dbm backend the shelf may live in
        # files with an added extension -- confirm what needs removing.
        pass
    os.system("rm -rf " + scratchSrc)
    #os.unlink(cvsDirectory + "/ChangeLog")
    print("finished at  %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),))