--AqsLC8rIMeq19msA
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

This is a late reply to a thread started by Tomasz Chmielewski:

http://lists.samba.org/archive/rsync...er/013878.html

He asked whether a workaround existed for renaming big files.

I've made a script (Python is needed on both sides) that hard-links the
guessed files into the right directory and then relies on the --fuzzy
feature of rsync. It is used to mirror a 100 GB archive where files are
often renamed.

Maybe the algorithm could be included in the main rsync codebase. I
never understood why the --fuzzy feature was restricted to the same
directory.

--
Antony Lesuisse

--AqsLC8rIMeq19msA
Content-Type: text/x-python; charset=us-ascii
Content-Disposition: attachment; filename="rsyncpp.py"

#!/usr/bin/python
#
# Rsync pre-processor (handles renames)
#
# 2004 - Antony Lesuisse - Public domain
#
import optparse,os,pickle,random,signal,stat,sys

#----------------------------------------------------------
# Remote ssh side
#----------------------------------------------------------
# Program text that remote_list() pipes into a Python interpreter on the
# source host.  The placeholder SRC_DIR is substituted with the remote
# directory before sending.  The program lists every regular file that
# `find` reports as larger than 128k below that directory and writes the
# pickled list of {'name','size','time'} dicts to its stdout.  The final
# write sits outside the `if` so that a (possibly empty) list is always
# sent, even when the directory does not exist.
remote_code="""
import os,pickle,sys
src_dir='SRC_DIR'
src_send=[]
if os.path.isdir(src_dir):
    os.chdir(src_dir)
    for i in os.popen("find . -type f -size +128k -print0").read().split("\\x00")[:-1]:
        s=os.stat(i)
        src_send.append({'name':i, 'size':s.st_size, 'time':int(s.st_mtime)})
sys.stdout.write(pickle.dumps(src_send))
"""

def remote_list(src,ssh):
    """Fetch the list of large files found below the remote source directory.

    The module-level ``remote_code`` program (with SRC_DIR substituted) is
    piped into a Python interpreter started on the remote host through
    ``ssh``; whatever that program writes to stdout is unpickled and
    returned.

    src -- remote spec of the form "host:dir" (anything before the first
           ":" is handed to ssh as the host)
    ssh -- remote shell command line, e.g. "ssh"

    Returns the unpickled list sent by the remote side, or an empty list
    when nothing at all was received.  Raises ValueError when ``src`` has
    no ":".  Exits the process when the remote side stays silent for too
    long (120s before the first read completes, 30s between reads).
    """
    # Function-scope import: the file's import line does not provide it.
    import subprocess

    if ":" not in src:
        raise ValueError("rsyncpp: source must look like host:dir, got %r"%(src,))
    src_host,src_dir=src.split(":",1)

    # SRC_DIR sits inside a single-quoted Python literal in remote_code, so
    # backslashes and single quotes in the directory name must be escaped.
    quoted_dir=src_dir.replace("\\","\\\\").replace("'","\\'")

    cmd=r'%s -C "%s" "python -c \"import sys;exec sys.stdin;\" "'%(ssh,src_host)
    proc=subprocess.Popen(cmd,shell=True,bufsize=-1,
                          stdin=subprocess.PIPE,stdout=subprocess.PIPE)
    proc.stdin.write(remote_code.replace("SRC_DIR",quoted_dir))
    proc.stdin.close()

    def on_timeout(signum,frame):
        # A stalled transfer is a failure: report it and exit non-zero.
        sys.exit("rsyncpp: timed out waiting for the remote file list")

    print("rsyncpp: Reading remote list:")
    signal.signal(signal.SIGALRM,on_timeout)
    chunks=[]
    received=0
    signal.alarm(120)
    try:
        while 1:
            buf=proc.stdout.read(8192)
            # Every completed read re-arms a shorter watchdog.
            signal.alarm(30)
            if not buf:
                print("done")
                break
            chunks.append(buf)
            received+=len(buf)
            sys.stdout.write("\r%d "%received)
            sys.stdout.flush()
    finally:
        signal.alarm(0)
    proc.stdout.close()
    proc.wait()

    data="".join(chunks)
    if not data:
        # ssh or the remote interpreter failed; rename detection is only an
        # optimisation, so let the caller carry on without it.
        print("rsyncpp: no file list received from remote side")
        return []
    # NOTE(security): pickle.loads() can execute arbitrary code chosen by
    # whoever produced the data, so this fully trusts the remote host.
    return pickle.loads(data)

#----------------------------------------------------------
# pre process rsync
#----------------------------------------------------------
def _index_local_files():
    """Map file size -> list of large files below the current directory.

    Each entry is a {'name','size','time'} dict; every list is ordered by
    modification time.
    """
    by_size={}
    for name in os.popen("find . -type f -size +128k -print0").read().split("\x00")[:-1]:
        s=os.stat(name)
        entry={'name':name, 'size':s.st_size, 'time':int(s.st_mtime)}
        by_size.setdefault(s.st_size,[]).append(entry)
    for group in by_size.values():
        group.sort(key=lambda f:f['time'])
    return by_size


def _link_guess(wanted,by_size):
    """Hard-link the best local candidate for a missing file as NAME.fuzzy.

    A candidate must have exactly the wanted size; among those the one
    whose mtime is closest to the wanted mtime wins.
    """
    cf=wanted['name']
    if os.path.isfile(cf):
        return
    candidates=by_size.get(wanted['size'])
    if not candidates:
        print("dest: '%s' missing, no match guessed "%cf)
        return
    ct=wanted['time']
    guess=min(candidates,key=lambda f:abs(f['time']-ct))
    print("dest: '%s' missing, guessed '%s'"%(cf,guess['name']))
    fdir=os.path.dirname(cf)
    try:
        if fdir and not os.path.isdir(fdir):
            os.makedirs(fdir)
        os.link(guess['name'],cf+".fuzzy")
    except OSError:
        # The link is only a hint for rsync; a failure here (for example a
        # leftover .fuzzy link) must not abort the whole synchronisation.
        print("dest: '%s' could not be linked: %s"%(cf,sys.exc_info()[1]))


def local_side(src,dest,ssh,rsync):
    """Pre-link probable renames inside ``dest``, then run rsync --fuzzy.

    src   -- remote source, "host:dir"
    dest  -- local destination directory
    ssh   -- remote shell command, used for the listing and passed to
             rsync through -e
    rsync -- string of extra, already shell-quoted rsync options that is
             spliced verbatim into the rsync command line

    When ``dest`` is an existing directory, the remote file list is
    fetched with remote_list() and, for every listed file missing locally,
    a same-sized local file is hard-linked next to it as "<name>.fuzzy".
    The rsync command is printed and run whether or not ``dest`` exists.
    """
    if os.path.isdir(dest):
        start_dir=os.getcwd()
        os.chdir(dest)
        try:
            print("rsyncpp: Waiting for remote list...")
            src_list=remote_list(src,ssh)
            dest_look=_index_local_files()
            print("processing: %d files"%len(src_list))
            for wanted in src_list:
                _link_guess(wanted,dest_look)
        finally:
            # Return to the starting directory so that a relative ``dest``
            # (and any relative path among the rsync options) still
            # resolves correctly when rsync runs below.
            os.chdir(start_dir)
    cmd=r'rsync -avz %s --fuzzy --delete-after -e "%s" "%s" "%s"'%(rsync,ssh,src,dest)
    print(cmd)
    os.system(cmd)
# Command-line entry point: every argument except the last two is an rsync
# option; the last two are the remote source and the local destination.
if __name__ == '__main__':
    if len(sys.argv)<3:
        print("usage: %s [all rsync options(avz implied)...] login@srchost:srcdir/ localdir/"%sys.argv[0])
    else:
        ssh="ssh"
        rsync=""
        arg=sys.argv[1:-2]
        while arg:
            i=arg.pop(0)
            if i=="-e":
                # The remote shell is needed here as well (for the remote
                # listing), so it is extracted instead of being forwarded.
                if not arg:
                    sys.exit("rsyncpp: option -e requires an argument")
                ssh=arg.pop(0)
            elif i.startswith("--rsh="):
                # Long spelling of -e.
                ssh=i[len("--rsh="):]
            else:
                # Single-quote for the shell; an embedded quote is written
                # as '\'' so the option stays one word.
                rsync+=" '%s'"%i.replace("'","'\\''")
        src=sys.argv[-2]
        dest=sys.argv[-1]
        local_side(src,dest,ssh,rsync)


--AqsLC8rIMeq19msA
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Disposition: inline

--
To unsubscribe or change options: https://lists.samba.org/mailman/listinfo/rsync
Before posting, read: http://www.catb.org/~esr/faqs/smart-questions.html
--AqsLC8rIMeq19msA--