Skip to content

Instantly share code, notes, and snippets.

@oldsharp
Created July 17, 2015 09:12
Show Gist options
  • Save oldsharp/ce62c36717e0b1151963 to your computer and use it in GitHub Desktop.
Save oldsharp/ce62c36717e0b1151963 to your computer and use it in GitHub Desktop.
yet another stupid web crawler
#!/usr/bin/python2
# -*- coding: utf-8 -*-
import re
import time
from bs4 import BeautifulSoup
from selenium import webdriver
# Page that main() loads and keeps refreshing.
URL = "http://dianying.taobao.com/cinemaDetail.htm?spm=a1z21.6646277.w2.303.cccQvy&cinemaId=13553&n_s=new"
# Regular expression searched for on each line of the prettified page source.
# The `ur` prefix (raw unicode literal) is Python 2-only syntax.
# NOTE(review): as written here the parentheses are ASCII, so they act as a
# regex group rather than matching literal parentheses -- confirm against the
# actual page text (it may use literal or full-width parentheses).
RAW = ur".*7月18日(周六).*"
def main():
    """Poll the page at URL until match() reports a hit.

    Opens URL in a Firefox WebDriver session, then repeatedly parses the
    current page source with BeautifulSoup and passes the prettified text to
    match(). On a hit it prints "Gotcha" and stops; otherwise it prints a
    notice, sleeps five seconds, refreshes the page and checks again.
    A KeyboardInterrupt (Ctrl-C) ends the loop quietly. The browser session,
    if one was created, is closed on every exit path.
    """
    # Bind the name before entering the try block so the finally clause never
    # touches an unbound local when webdriver.Firefox() itself fails or is
    # interrupted (that would mask the original error).
    browser = None
    try:
        browser = webdriver.Firefox()
        browser.get(URL)
        while True:
            soup = BeautifulSoup(browser.page_source, "html.parser")
            if match(soup.prettify()):
                # Put notify logic here
                # Parenthesised single-argument print produces the same output
                # under the Python 2 print statement.
                print("Gotcha")
                break
            print("Pity, refreshing...")
            time.sleep(5)
            browser.refresh()
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop polling; exit without a traceback.
        pass
    finally:
        if browser is not None:
            browser.quit()
def match(text):
    """Return True if any line of *text* matches the RAW pattern, else False."""
    date_re = re.compile(RAW)
    # Check line by line, stopping at the first line that matches.
    return any(date_re.search(row) for row in text.splitlines())
# Run the polling loop only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment