Skip to content

Instantly share code, notes, and snippets.

@youngershen
Created April 19, 2012 06:21
Show Gist options
  • Save youngershen/2419086 to your computer and use it in GitHub Desktop.
Save youngershen/2419086 to your computer and use it in GitHub Desktop.
your sister's qvod link spider
import re
import urllib
import string
def parseUrl(baseUrl):
    """Download ``baseUrl`` and return the URL-like matches found in its body.

    Args:
        baseUrl: Address of the page to fetch with ``urllib.urlopen``.

    Returns:
        The list produced by ``re.findall``. The pattern has five capture
        groups, so every element is a 5-tuple of group strings (empty
        strings for groups that did not take part), not the matched text.

    The result is also printed to stdout before it is returned.
    """
    f = urllib.urlopen(baseUrl)
    try:
        # A single read() already returns the whole body; appending it one
        # character at a time only rebuilt the same string more slowly.
        content = f.read()
    finally:
        # Release the connection even when the read fails.
        f.close()
    # The hyphen in the two path classes is escaped (``\w\-:``): written as
    # ``\w-:`` it is parsed as a range starting at a class escape, which the
    # ``re`` module rejects as a bad character range.
    # NOTE(review): ``[/w-]`` looks like a typo for ``[\w-]``; left unchanged
    # because the intended match set cannot be confirmed from this file.
    urlContent = re.findall(
        r'([\w-]+\.)+[/w-]+.([^a-z])(/[\w\-: ./?%&=]*)?|[a-zA-Z\-\.][\w-]+.([^a-z])(/[\w\-: ./?%&=]*)?',
        content)
    print(urlContent)
    return urlContent
def parseRegex(str):
    """Return every qvod link contained in the given text.

    A link has the form ``qvod://<9 digits>|<40 chars of A-Z/0-9>|<name>.<ext>``
    where ``<name>`` is made of digits and characters in the ``\\x80-\\xff``
    range and ``<ext>`` is one or more word characters.

    Args:
        str: Text to scan (the name shadows the builtin inside this function
            only; it is kept so keyword callers continue to work).

    Returns:
        List of the matched link strings, in order of appearance; empty
        when nothing matches.
    """
    # Trace marker written to stdout on every call.
    print('s')
    linkPattern = re.compile(
        r'qvod://[0-9]{9}\|[A-Z0-9]{40}\|[\x80-\xff0-9]+\.\w+')
    return linkPattern.findall(str)
def parseHtml(url):
    """Fetch ``url`` and print the qvod links found in the page body.

    Args:
        url: Address of the page to fetch with ``urllib.urlopen``.

    Returns:
        None. The list returned by ``parseRegex`` is only printed to stdout.
    """
    f = urllib.urlopen(url)
    try:
        # A single read() already returns the whole body; appending it one
        # character at a time only rebuilt the same string more slowly.
        page = f.read()
    finally:
        # Release the connection even when the read fails.
        f.close()
    result = parseRegex(page)
    print(result)
def doHandler():
    """Scan listing pages 1 through 99 of the site for qvod links.

    For each page number the URL is formed by appending the number to the
    listing address, echoed to stdout, and then passed to ``parseHtml``.
    """
    listingUrl = 'http://www.qvodzy.me/index.asp?page='
    pageNumber = 1
    while pageNumber < 100:
        pageUrl = '%s%d' % (listingUrl, pageNumber)
        print('url is--' + pageUrl)
        parseHtml(pageUrl)
        pageNumber += 1
if __name__ == "__main__":
    # Script entry point: only the front-page URL extraction is active.
    # Alternative entry points kept disabled by the author:
    #   parseHtml('s')
    #   doHandler()
    parseUrl('http://www.qvodzy.me')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment