Skip to content

Instantly share code, notes, and snippets.

@mynameisfiber
Created July 29, 2016 22:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mynameisfiber/4523c58e0a63479eef646405c4b6d63f to your computer and use it in GitHub Desktop.
Save mynameisfiber/4523c58e0a63479eef646405c4b6d63f to your computer and use it in GitHub Desktop.
Extract ingress codes from the blog
import functools
import pickle
import re

import requests
from lxml import html
def get_codes(dom):
    """Return the set of code candidates found anywhere in ``dom``.

    Combines the results of the three extractors: link fragments,
    ``alt``/``id`` attribute values, and ``<span>`` text.
    """
    return set().union(
        _get_codes_urls(dom),
        _get_codes_alt_id(dom),
        _get_codes_span(dom),
    )
def filter_codes(func):
    """Decorate a candidate generator so it only yields plausible codes.

    A candidate is kept when it is a single whitespace-free token that
    contains at least one lowercase ASCII letter and at least one digit,
    and does not contain any of the blacklisted words as a substring.

    Args:
        func: Callable returning an iterable of candidate strings.

    Returns:
        A generator function accepting the same arguments as ``func`` and
        yielding only the candidates that pass the filter, in their
        original order.
    """
    blacklist = ('menu', 'post', 'text', 'poll', 'calendar', 'blog', 'search',
                 'image', 'social')
    # Raw string: "\S" in a normal literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    is_code = re.compile(r"^(?=\S*[a-z])(?=\S*[0-9])\S+$")

    @functools.wraps(func)  # keep the wrapped extractor's name and docstring
    def wrapper(*args, **kwargs):
        for code_candidate in func(*args, **kwargs):
            if is_code.match(code_candidate) is None:
                continue
            if any(word in code_candidate for word in blacklist):
                continue
            yield code_candidate

    return wrapper
@filter_codes
def _get_codes_urls(dom):
    """Yield the fragment (text after the last ``#``) of every link href."""
    for href in dom.xpath(".//a/@href"):
        if "#" not in href:
            continue
        # rpartition splits on the last "#"; the tail is the fragment.
        fragment = href.rpartition("#")[2]
        yield fragment
@filter_codes
def _get_codes_alt_id(dom):
    """Yield every ``alt`` and ``id`` attribute value found under ``dom``."""
    for attribute_value in dom.xpath(".//@alt|.//@id"):
        yield attribute_value
@filter_codes
def _get_codes_span(dom):
    """Yield every text node that sits directly inside a ``<span>``."""
    for span_text in dom.xpath(".//span/text()"):
        yield span_text
if __name__ == "__main__":
    # Download the page and extract every code candidate from it. The
    # timeout keeps the script from hanging forever on a stalled connection,
    # and raise_for_status() avoids scraping an HTTP error page for codes.
    data = requests.get("http://investigate.ingress.com/", timeout=30)
    data.raise_for_status()
    dom = html.fromstring(data.content)
    codes = get_codes(dom)

    # Load the codes seen on previous runs. A missing, unreadable or corrupt
    # history file is treated as "no history" (best effort), without also
    # swallowing KeyboardInterrupt/SystemExit as a bare ``except`` would.
    # NOTE(security): pickle can execute arbitrary code while loading; only
    # ever point this at a history file written by this script.
    try:
        with open("codes_history.pkl", 'rb') as fd:
            previous_codes = pickle.load(fd)
    except (OSError, EOFError, pickle.UnpicklingError,
            AttributeError, ImportError, IndexError):
        previous_codes = set()

    # Persist the union so codes reported now are not reported again.
    with open("codes_history.pkl", 'wb') as fd:
        pickle.dump(codes | previous_codes, fd)

    # Report only codes not seen before; sorted for a stable output order.
    new_codes = codes - previous_codes
    print("\n".join(sorted(new_codes)) or "No New Codes")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment