yne/web2fs.py

## web2fs.py
#!/usr/bin/env python
import os,re,sys,json,errno,urllib2,urllib,threading

from fuse import FUSE, FuseOSError, Operations

class MyWebsite(Operations):
	"""FUSE operations exposing a paginated web site as a directory tree.

	Directory listings are scraped from HTML pages chosen by ``self.routes``.
	Entries whose name starts with ``http`` are percent-encoded URLs; they are
	reported as regular files and downloaded in a background thread on ``open``.
	"""

	# st_mode values: file-type bits (directory / regular file) + permission bits.
	_DIR_MODE = 0o40666      # directory: listed sub-directory or unknown path
	_LISTING_MODE = 0o40444  # directory first seen through readdir
	_FILE_MODE = 0o100666    # regular file: entry name is an encoded URL
	# Upper bound on the number of listing pages fetched by one readdir call.
	_MAX_PAGES = 19

	def __init__(self, root):
		"""Create empty caches and the route table.

		root -- text inserted between ``http://`` and the route URL when
		        building listing URLs (the site host).
		"""
		self.root = root
		self.files = {}  # file handle -> downloaded content
		self.cache = {}  # path -> attribute dict (stat fields + 'f', 'page', 'complete')
		self.ua = {'User-Agent': 'Mozilla/5.0'}

		def sliced(start, end):
			"""Build a mapper returning ``match[start:end]``."""
			return lambda match: match[start:end]

		def quoted_url(start):
			"""Build a mapper turning a scheme-less ``//host/...`` attribute match
			into a fully percent-encoded ``http:`` URL (no ``/`` left in it)."""
			return lambda match: urllib.quote('http:' + match[start:-1], safe='')

		# Each route: 'path' regex matched against the FUSE path (first match
		# wins, so order matters), 'url' template filled with the regex groups,
		# 'find' regex run over the fetched HTML, 'map' turning each match into
		# an entry name.
		self.routes = [
			{'path': r'^/$', 'url': 'community',
			 'find': 'ref="/c/[-a-z_]+?/', 'map': sliced(8, -1)},
			{'path': r'^/([-a-z_]+)_videos$', 'url': 'c/%s_videos/videos',
			 'find': 'sample-video=".*?"', 'map': quoted_url(14)},
			{'path': r'^/([-a-z_]+)$', 'url': 'c/%s/albums',
			 'find': '/albums/.+?/view/', 'map': sliced(8, -6)},
			{'path': r'^/.*?/([-a-z_0-9]+_\d+)$', 'url': 'c/-/pictures/album/%s',
			 'find': 'src="//[ic].+?"', 'map': quoted_url(5)},
			{'path': r'^/([-a-z_]+)/(\+[-+a-z_0-9]+)$', 'url': 'c/%s/albums/tagged/%s',
			 'find': '/albums/.+?/view/', 'map': sliced(8, -6)},
			{'path': r'^/([-a-z_]+)/([-a-z_0-9]+)$', 'url': 'c/%s/pictures/frontpage/0/text/%s',
			 'find': 'src="//[ic].+?"', 'map': quoted_url(5)},
		]

	def _download(self, url):
		"""Return the body of *url*, always closing the response."""
		response = urllib2.urlopen(urllib2.Request(url, None, self.ua))
		try:
			return response.read()
		finally:
			response.close()

	def fetch_thread(self, path, target, fh, attrs):
		"""Download the file behind *path* into ``target[fh]`` (thread body).

		The basename of *path* is a percent-encoded URL.  Up to three variants
		are fetched in turn: the URL as listed, the URL with ``.100x100``
		replaced by ``.315x0``, and that URL with ``.315x0`` removed.  Every
		successful download replaces ``target[fh]`` and updates
		``attrs[path]['st_size']``.  Variants identical to the previous one are
		skipped instead of being downloaded again.

		path   -- FUSE path of the file
		target -- dict receiving the content under key *fh*
		fh     -- file handle used as key in *target*
		attrs  -- attribute cache whose entry for *path* gets its size updated
		"""
		listed = urllib.unquote(os.path.basename(path))
		medium = listed.replace('.100x100', '.315x0')
		full = medium.replace('.315x0', '')
		target[fh] = ""

		def store(url):
			data = self._download(url)
			target[fh] = data
			if path in attrs:  # the path may have been opened without being listed
				attrs[path]['st_size'] = len(data)

		url = listed
		try:
			store(url)
			if medium != listed:
				try:
					store(medium)
				except IOError:
					pass  # the medium variant is optional
			if full != medium:
				url = full
				store(url)
		except (IOError, ValueError) as e:
			# IOError covers HTTPError/URLError and socket failures; ValueError
			# is raised for a basename that is not a URL.
			size = len(target.get(fh, ""))
			print("ERROR>>> %s %s %d" % (e, url, size))
			if not size:
				# Drop the empty placeholder so a later open() retries.
				target.pop(fh, None)

	def fetch(self, path, page=1):
		"""Scrape one listing page for *path*.

		Returns the entry names found on page *page*, in page order and without
		duplicates, or None when no route matches *path* or the server answers
		with an HTTP error.  Other network errors propagate to the caller.
		"""
		for route in self.routes:
			match = re.match(route['path'], path)
			if not match:
				continue
			url = 'http://%s/%s/page/%d/' % (self.root, route['url'] % match.groups(), page)
			try:
				html = self._download(url)
			except urllib2.HTTPError as e:
				print("%s %s" % (e, url))
				return None
			seen = set()
			names = []
			for found in re.findall(route['find'], html):
				name = route['map'](found)
				if name not in seen:
					seen.add(name)
					names.append(name)
			return names
		return None

	# STRUCTURE OPERATION
	def getattr(self, path, fh=None):
		"""Return the attribute dict of *path*.

		Cached paths return their cache entry (stat fields plus the bookkeeping
		keys used by readdir); any other path is reported as an empty directory
		so that it can be entered without having been listed first.
		"""
		try:
			return self.cache[path]
		except KeyError:
			return {'st_mode': self._DIR_MODE, 'st_size': 0, 'f': [], 'page': 0}

	def readdir(self, path, fh):
		"""Yield the entry names of directory *path*, scraping pages on demand.

		Paths with a dot in their first two characters yield nothing.  Names
		already cached are replayed first; if the listing is not complete,
		further pages are fetched (at most ``_MAX_PAGES`` per call) until a page
		brings no new name, then the listing is marked complete.
		"""
		# only handle visible files
		if '.' in path[:2]:
			return
		# send the 2 mandatory nodes
		yield '.'
		yield '..'

		listing = self.cache.get(path)
		if listing is None:
			listing = self.cache[path] = {
				'st_mode': self._LISTING_MODE, 'st_size': 0, 'f': [], 'page': 0}

		# Replay what is already known, also when resuming an unfinished listing.
		for name in list(listing['f']):
			yield name.strip("/")
		if 'complete' in listing:
			return

		known = set(listing['f'])
		for _ in range(self._MAX_PAGES):
			page = listing['page'] + 1
			results = self.fetch(path, page) or []
			# Advance only after the fetch returned, so a page whose fetch
			# raised is requested again by the next readdir.
			listing['page'] = page
			fresh = [name for name in results if name not in known]
			if not fresh:
				break
			for name in fresh:
				known.add(name)
				listing['f'].append(name)
				self.cache[os.path.join(path, name)] = {
					'st_mode': self._FILE_MODE if name[:4] == 'http' else self._DIR_MODE,
					'st_size': 4, 'f': [], 'page': 0}
				yield name
		listing['complete'] = True

	# FILE OPERATION
	def open(self, path, flags):
		"""Return a handle for *path*, starting its download if needed.

		The handle is the hash of *path* masked to 28 bits.  The empty
		placeholder is stored before the thread starts so that a second open of
		the same path does not launch a second download.
		"""
		fh = hash(path) & 0x0FFFFFFF
		if fh not in self.files:
			self.files[fh] = ""
			worker = threading.Thread(
				target=self.fetch_thread, args=(path, self.files, fh, self.cache))
			worker.start()
		return fh

	def read(self, path, length, offset, fh):
		"""Return up to *length* bytes at *offset* of the content stored for *fh*
		(empty when nothing is stored for that handle)."""
		content = self.files.get(fh, '')
		return content[offset:offset + length]

	def release(self, path, fh):
		"""Close *fh*; the downloaded content is kept in ``self.files``."""
		return 0

if __name__ == '__main__':
	# Usage: <script> <site-host> <mountpoint>
	# argv[1] is passed to MyWebsite as the site root, argv[2] is the mount point.
	if len(sys.argv) < 3:
		sys.exit('usage: %s <site-host> <mountpoint>' % sys.argv[0])
	FUSE(MyWebsite(sys.argv[1]), sys.argv[2], nothreads=True, foreground=True)
	#!/usr/bin/env python
	import os,re,sys,json,errno,urllib2,urllib,threading

	from fuse import FUSE, FuseOSError, Operations

	class MyWebsite(Operations):
		"""FUSE operations exposing a paginated web site as a directory tree.

		Directory listings are scraped from HTML pages chosen by ``self.routes``.
		Entries whose name starts with ``http`` are percent-encoded URLs; they are
		reported as regular files and downloaded in a background thread on ``open``.
		"""

		# st_mode values: file-type bits (directory / regular file) + permission bits.
		_DIR_MODE = 0o40666      # directory: listed sub-directory or unknown path
		_LISTING_MODE = 0o40444  # directory first seen through readdir
		_FILE_MODE = 0o100666    # regular file: entry name is an encoded URL
		# Upper bound on the number of listing pages fetched by one readdir call.
		_MAX_PAGES = 19

		def __init__(self, root):
			"""Create empty caches and the route table.

			root -- text inserted between ``http://`` and the route URL when
			        building listing URLs (the site host).
			"""
			self.root = root
			self.files = {}  # file handle -> downloaded content
			self.cache = {}  # path -> attribute dict (stat fields + 'f', 'page', 'complete')
			self.ua = {'User-Agent': 'Mozilla/5.0'}

			def sliced(start, end):
				"""Build a mapper returning ``match[start:end]``."""
				return lambda match: match[start:end]

			def quoted_url(start):
				"""Build a mapper turning a scheme-less ``//host/...`` attribute match
				into a fully percent-encoded ``http:`` URL (no ``/`` left in it)."""
				return lambda match: urllib.quote('http:' + match[start:-1], safe='')

			# Each route: 'path' regex matched against the FUSE path (first match
			# wins, so order matters), 'url' template filled with the regex groups,
			# 'find' regex run over the fetched HTML, 'map' turning each match into
			# an entry name.
			self.routes = [
				{'path': r'^/$', 'url': 'community',
				 'find': 'ref="/c/[-a-z_]+?/', 'map': sliced(8, -1)},
				{'path': r'^/([-a-z_]+)_videos$', 'url': 'c/%s_videos/videos',
				 'find': 'sample-video=".*?"', 'map': quoted_url(14)},
				{'path': r'^/([-a-z_]+)$', 'url': 'c/%s/albums',
				 'find': '/albums/.+?/view/', 'map': sliced(8, -6)},
				{'path': r'^/.*?/([-a-z_0-9]+_\d+)$', 'url': 'c/-/pictures/album/%s',
				 'find': 'src="//[ic].+?"', 'map': quoted_url(5)},
				{'path': r'^/([-a-z_]+)/(\+[-+a-z_0-9]+)$', 'url': 'c/%s/albums/tagged/%s',
				 'find': '/albums/.+?/view/', 'map': sliced(8, -6)},
				{'path': r'^/([-a-z_]+)/([-a-z_0-9]+)$', 'url': 'c/%s/pictures/frontpage/0/text/%s',
				 'find': 'src="//[ic].+?"', 'map': quoted_url(5)},
			]

		def _download(self, url):
			"""Return the body of *url*, always closing the response."""
			response = urllib2.urlopen(urllib2.Request(url, None, self.ua))
			try:
				return response.read()
			finally:
				response.close()

		def fetch_thread(self, path, target, fh, attrs):
			"""Download the file behind *path* into ``target[fh]`` (thread body).

			The basename of *path* is a percent-encoded URL.  Up to three variants
			are fetched in turn: the URL as listed, the URL with ``.100x100``
			replaced by ``.315x0``, and that URL with ``.315x0`` removed.  Every
			successful download replaces ``target[fh]`` and updates
			``attrs[path]['st_size']``.  Variants identical to the previous one are
			skipped instead of being downloaded again.

			path   -- FUSE path of the file
			target -- dict receiving the content under key *fh*
			fh     -- file handle used as key in *target*
			attrs  -- attribute cache whose entry for *path* gets its size updated
			"""
			listed = urllib.unquote(os.path.basename(path))
			medium = listed.replace('.100x100', '.315x0')
			full = medium.replace('.315x0', '')
			target[fh] = ""

			def store(url):
				data = self._download(url)
				target[fh] = data
				if path in attrs:  # the path may have been opened without being listed
					attrs[path]['st_size'] = len(data)

			url = listed
			try:
				store(url)
				if medium != listed:
					try:
						store(medium)
					except IOError:
						pass  # the medium variant is optional
				if full != medium:
					url = full
					store(url)
			except (IOError, ValueError) as e:
				# IOError covers HTTPError/URLError and socket failures; ValueError
				# is raised for a basename that is not a URL.
				size = len(target.get(fh, ""))
				print("ERROR>>> %s %s %d" % (e, url, size))
				if not size:
					# Drop the empty placeholder so a later open() retries.
					target.pop(fh, None)

		def fetch(self, path, page=1):
			"""Scrape one listing page for *path*.

			Returns the entry names found on page *page*, in page order and without
			duplicates, or None when no route matches *path* or the server answers
			with an HTTP error.  Other network errors propagate to the caller.
			"""
			for route in self.routes:
				match = re.match(route['path'], path)
				if not match:
					continue
				url = 'http://%s/%s/page/%d/' % (self.root, route['url'] % match.groups(), page)
				try:
					html = self._download(url)
				except urllib2.HTTPError as e:
					print("%s %s" % (e, url))
					return None
				seen = set()
				names = []
				for found in re.findall(route['find'], html):
					name = route['map'](found)
					if name not in seen:
						seen.add(name)
						names.append(name)
				return names
			return None

		# STRUCTURE OPERATION
		def getattr(self, path, fh=None):
			"""Return the attribute dict of *path*.

			Cached paths return their cache entry (stat fields plus the bookkeeping
			keys used by readdir); any other path is reported as an empty directory
			so that it can be entered without having been listed first.
			"""
			try:
				return self.cache[path]
			except KeyError:
				return {'st_mode': self._DIR_MODE, 'st_size': 0, 'f': [], 'page': 0}

		def readdir(self, path, fh):
			"""Yield the entry names of directory *path*, scraping pages on demand.

			Paths with a dot in their first two characters yield nothing.  Names
			already cached are replayed first; if the listing is not complete,
			further pages are fetched (at most ``_MAX_PAGES`` per call) until a page
			brings no new name, then the listing is marked complete.
			"""
			# only handle visible files
			if '.' in path[:2]:
				return
			# send the 2 mandatory nodes
			yield '.'
			yield '..'

			listing = self.cache.get(path)
			if listing is None:
				listing = self.cache[path] = {
					'st_mode': self._LISTING_MODE, 'st_size': 0, 'f': [], 'page': 0}

			# Replay what is already known, also when resuming an unfinished listing.
			for name in list(listing['f']):
				yield name.strip("/")
			if 'complete' in listing:
				return

			known = set(listing['f'])
			for _ in range(self._MAX_PAGES):
				page = listing['page'] + 1
				results = self.fetch(path, page) or []
				# Advance only after the fetch returned, so a page whose fetch
				# raised is requested again by the next readdir.
				listing['page'] = page
				fresh = [name for name in results if name not in known]
				if not fresh:
					break
				for name in fresh:
					known.add(name)
					listing['f'].append(name)
					self.cache[os.path.join(path, name)] = {
						'st_mode': self._FILE_MODE if name[:4] == 'http' else self._DIR_MODE,
						'st_size': 4, 'f': [], 'page': 0}
					yield name
			listing['complete'] = True

		# FILE OPERATION
		def open(self, path, flags):
			"""Return a handle for *path*, starting its download if needed.

			The handle is the hash of *path* masked to 28 bits.  The empty
			placeholder is stored before the thread starts so that a second open of
			the same path does not launch a second download.
			"""
			fh = hash(path) & 0x0FFFFFFF
			if fh not in self.files:
				self.files[fh] = ""
				worker = threading.Thread(
					target=self.fetch_thread, args=(path, self.files, fh, self.cache))
				worker.start()
			return fh

		def read(self, path, length, offset, fh):
			"""Return up to *length* bytes at *offset* of the content stored for *fh*
			(empty when nothing is stored for that handle)."""
			content = self.files.get(fh, '')
			return content[offset:offset + length]

		def release(self, path, fh):
			"""Close *fh*; the downloaded content is kept in ``self.files``."""
			return 0

	if __name__ == '__main__':
		# Usage: <script> <site-host> <mountpoint>
		# argv[1] is passed to MyWebsite as the site root, argv[2] is the mount point.
		if len(sys.argv) < 3:
			sys.exit('usage: %s <site-host> <mountpoint>' % sys.argv[0])
		FUSE(MyWebsite(sys.argv[1]), sys.argv[2], nothreads=True, foreground=True)