Skip to content

Instantly share code, notes, and snippets.

@yne
Last active February 27, 2016 10:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yne/599420215e8df4bb8054 to your computer and use it in GitHub Desktop.
Save yne/599420215e8df4bb8054 to your computer and use it in GitHub Desktop.
Website as filesystem testing
#!/usr/bin/env python
import os,re,sys,json,errno,urllib2,urllib,threading
from fuse import FUSE, FuseOSError, Operations
class MyWebsite(Operations):
    """FUSE operations that expose a website's listing pages as a filesystem.

    Paths are matched against ``self.routes``; the matching route says which
    page of the site to download and how to extract entry names from its
    HTML.  Entries whose name starts with ``http`` are URL-quoted media links
    and are presented as regular files; everything else is a directory.

    The code targets Python 2 (it relies on ``urllib2`` and ``urllib.quote``).
    """

    # Upper bound on the number of listing pages fetched by one readdir() call.
    MAX_PAGES_PER_READDIR = 19

    def __init__(self, root):
        """Store the site host and build the path-to-URL routing table.

        root -- host name of the site; pages are fetched from
                ``http://<root>/...``.
        """
        self.root = root
        self.files = {}  # file handle -> bytes downloaded so far
        self.cache = {}  # path -> stat-like dict plus 'f', 'page', 'complete'
        self.ua = {'User-Agent': 'Mozilla/5.0'}
        # Each route holds:
        #   path -- regex matched against the FUSE path (groups fill 'url')
        #   url  -- site-relative URL template of the listing page
        #   find -- regex locating the raw entries in the downloaded HTML
        #   map  -- converts one raw match into a directory entry name
        # Order matters: fetch() uses the first route whose 'path' matches.
        self.routes = [
            {'path': r'^/$',
             'url': 'community',
             'find': r'ref="/c/[-a-z_]+?/',
             'map': lambda a: a[8:-1]},
            {'path': r'^/([-a-z_]+)_videos$',
             'url': 'c/%s_videos/videos',
             'find': r'sample-video=".*?"',
             'map': lambda a: urllib.quote('http:' + a[14:-1], safe='')},
            {'path': r'^/([-a-z_]+)$',
             'url': 'c/%s/albums',
             'find': r'/albums/.+?/view/',
             'map': lambda a: a[8:-6]},
            {'path': r'^/.*?/([-a-z_0-9]+_\d+)$',
             'url': 'c/-/pictures/album/%s',
             'find': r'src="//[ic].+?"',
             'map': lambda a: urllib.quote('http:' + a[5:-1], safe='')},
            {'path': r'^/([-a-z_]+)/(\+[-+a-z_0-9]+)$',
             'url': 'c/%s/albums/tagged/%s',
             'find': r'/albums/.+?/view/',
             'map': lambda a: a[8:-6]},
            {'path': r'^/([-a-z_]+)/([-a-z_0-9]+)$',
             'url': 'c/%s/pictures/frontpage/0/text/%s',
             'find': r'src="//[ic].+?"',
             'map': lambda a: urllib.quote('http:' + a[5:-1], safe='')},
        ]

    @staticmethod
    def _entry(mode, size):
        """Return a fresh cache record with the given st_mode and st_size."""
        return {'st_mode': mode, 'st_size': size, 'f': [], 'page': 0}

    def _http_get(self, url):
        """Return the body of url, always closing the HTTP response."""
        response = urllib2.urlopen(urllib2.Request(url, None, self.ua))
        try:
            return response.read()
        finally:
            response.close()

    def _download(self, url, path, target, fh, attrs):
        """Download url into target[fh] and publish its length as st_size."""
        data = self._http_get(url)
        target[fh] = data
        # The path may have been opened without ever being listed.
        if path in attrs:
            attrs[path]['st_size'] = len(data)

    def fetch_thread(self, path, target, fh, attrs):
        """Download the file behind path into target[fh] (runs in a thread).

        The URL is the unquoted basename of path.  Up to three renditions are
        fetched in turn -- the URL as given, the '.315x0' variant, then the
        URL with the size marker removed -- each one replacing the previous
        content so that readers get data as early as possible.

        path   -- FUSE path of the file.
        target -- dict receiving the content under key fh.
        fh     -- file handle returned by open().
        attrs  -- dict of stat records; attrs[path]['st_size'] is updated.
        """
        url = urllib.unquote(os.path.basename(path))
        target[fh] = ""
        try:
            self._download(url, path, target, fh, attrs)

            # Skip renditions whose URL is unchanged: fetching the very same
            # URL again would only repeat the previous download.
            medium = url.replace('.100x100', '.315x0')
            if medium != url:
                url = medium
                try:
                    self._download(url, path, target, fh, attrs)
                except urllib2.HTTPError:
                    pass  # the medium rendition is optional

            full = url.replace('.315x0', '')
            if full != url:
                url = full
                self._download(url, path, target, fh, attrs)
        except urllib2.URLError as e:
            # URLError is the base class of HTTPError, so connection failures
            # are reported too instead of killing the thread.
            print("ERROR>>> %s %s %d" % (e, url, len(target.get(fh, ""))))
        # Nothing was downloaded: drop the placeholder so that a later open()
        # starts a new attempt.
        if not target.get(fh):
            target.pop(fh, None)

    def fetch(self, path, page=1):
        """Return the entry names found on one listing page for path.

        path -- FUSE path, matched against the routes in order.
        page -- 1-based page number appended to the listing URL.

        Returns the names in page order without duplicates, or None when no
        route matches or the page could not be downloaded.
        """
        for route in self.routes:
            matched = re.match(route['path'], path)
            if not matched:
                continue
            url = ('http://' + self.root + '/' + route['url'] % matched.groups()
                   + '/page/%d/' % page)
            try:
                html = self._http_get(url)
            except urllib2.URLError as e:
                print("%s %s" % (e, url))
                return None
            # Order-preserving de-duplication in a single pass.
            seen = set()
            names = []
            for raw in re.findall(route['find'], html):
                name = route['map'](raw)
                if name not in seen:
                    seen.add(name)
                    names.append(name)
            return names
        return None

    # STRUCTURE OPERATION
    def getattr(self, path, fh=None):
        """Return the stat record of path.

        Paths that were never listed are reported as an empty directory
        (mode 040666); no error is raised for them.
        """
        # attrs = ('st_atime', 'st_ctime', 'st_gid', 'st_mode', 'st_mtime',
        #          'st_nlink', 'st_size', 'st_uid')
        if path not in self.cache:
            return self._entry(0o40666, 0)
        return self.cache[path]

    def readdir(self, path, fh):
        """Yield the entries of directory path, fetching pages as needed.

        Entries that are already cached are yielded first; if the directory
        is not marked complete, further listing pages are downloaded (at most
        MAX_PAGES_PER_READDIR per call) until an empty page is returned.
        Each page is recorded in the cache before any of its names is
        yielded, so a listing interrupted part-way can be resumed without
        losing entries.
        """
        # only handle visible files
        if '.' in path[:2]:
            return
        # send the 2 mandatory nodes
        yield '.'
        yield '..'

        listing = self.cache.get(path)
        if listing is None:
            listing = self.cache[path] = self._entry(0o40444, 0)

        # replay what is already in RAM (everything, if the dir is complete)
        for name in list(listing['f']):
            yield name.strip("/")
        if 'complete' in listing:
            return

        # the dir is unfinished: fill+yield it while we can
        for _ in range(self.MAX_PAGES_PER_READDIR):
            listing['page'] += 1
            results = self.fetch(path, listing['page']) or []
            if not results:
                break
            for name in results:
                listing['f'].append(name)
                # URL-quoted links are regular files, the rest directories.
                mode = 0o100666 if name.startswith('http') else 0o40666
                self.cache[os.path.join(path, name)] = self._entry(mode, 4)
            for name in results:
                yield name
        listing['complete'] = True

    # FILE OPERATION
    def open(self, path, flags):
        """Return a handle for path, starting its download if necessary.

        The handle is derived from hash(path).  It is registered in
        self.files before the worker thread starts so that a second open()
        arriving immediately afterwards does not start a second download.
        """
        fh = hash(path) & 0x0FFFFFFF
        if fh not in self.files:
            self.files[fh] = ""
            worker = threading.Thread(
                target=self.fetch_thread,
                args=(path, self.files, fh, self.cache))
            worker.start()
        return fh

    def read(self, path, length, offset, fh):
        """Return up to length bytes at offset of what is downloaded so far."""
        content = self.files.get(fh, '')
        return content[offset:offset + length]

    def release(self, path, fh):
        """Close a handle; the downloaded content is kept for later opens."""
        return 0
if __name__ == '__main__':
    # Two positional arguments are required: the site host name and the
    # directory on which to mount the filesystem.
    if len(sys.argv) < 3:
        sys.stderr.write('usage: %s <site-host> <mountpoint>\n' % sys.argv[0])
        sys.exit(2)
    FUSE(MyWebsite(sys.argv[1]), sys.argv[2], nothreads=True, foreground=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment