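# Scraper for Monsterboard.nl vacancies: walks the job categories on the
# front page, pages through each category's listings, extracts the
# individual postings, and uploads them to AmCAT via amcatclient.
# Python 2 script; requires lxml, requests, html2text, and amcatclient.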
from lxml import html, etree
import datetime
import re

import html2text
import requests

from amcatclient import AmcatAPI

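# get_pages yields the links to the individual category listing pages from
# the Monsterboard front page, skipping a handful of categories.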
def get_pages(url):
    page = requests.get(url)
    tree = html.fromstring(page.text)
    cat = tree.cssselect('div#listCategories.hideonstart a')
    for c in cat:
        link = c.get('href')
        # Skip these categories
        if any(skip in link for skip in
               ('Accounting-Financieel', 'Administratief', 'Bouw')):
            continue
        yield link

def get_npages(tree):
    # Currently unused: yields the pagination links from a listing page
    pagination = tree.cssselect('a.box.afterSelected')
    for p in pagination:
        yield p.get('href')
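
# get_job_text fetches a vacancy page and extracts the job description.
# Monsterboard serves several page layouts, so a list of CSS selectors is
# tried in turn; if none matches, the full page HTML is used as a fallback.
# The extracted HTML is converted to plain text with html2text.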
# CSS selectors tried in order to locate the job text
JOB_TEXT_SELECTORS = ('#bodycol', '#CJT_container', '#container',
                      '#content-container', '#content-container1',
                      '.jobview-section', '#monsterAppliesContentHolder')

def get_job_text(href):
    page = requests.get(href)
    tree = html.fromstring(page.text)
    for selector in JOB_TEXT_SELECTORS:
        content = tree.cssselect(selector)
        if content:
            inner = etree.tostring(content[0])
            break
    else:
        print("Cannot find job text for {href}, using all HTML".format(**locals()))
        inner = page.text
    return html2text.html2text(inner)
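
# scrape_job turns one row of the listings table into an article dict for
# AmCAT: headline, posting date, company as author, full text, and URL.
# Posting dates appear as the Dutch strings "Vandaag" (today) or
# "N dagen geleden" (N days ago) and are converted to absolute dates.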
def scrape_job(tr):
    headl = tr.cssselect('div.jobTitleContainer a')
    headline = headl[0].text_content().strip()
    text_ref = headl[0].get("href") + "#"
    print text_ref
    auth = tr.cssselect('div.companyContainer div a[href]')
    author = auth[0].get("title").strip()
    sect = tr.cssselect('div.jobLocationSingleLine a')
    if sect:
        section = sect[0].get("title")
    else:
        sect = tr.cssselect('div.jobLocationSingleLine')
        section = sect[0].text_content().replace("Locatie:", "").strip()
    # (section is extracted but not included in the uploaded article)
    datestr = tr.cssselect('div.fnt20')
    datestr = datestr[0].text_content().replace("Geplaatst:", "").strip()
    vandaag = datetime.date.today()
    if datestr == "Vandaag":
        date = vandaag
    elif "dagen geleden" in datestr:
        m = re.match(r"(\d+) dagen geleden", datestr)
        ndays = int(m.group(1))
        date = vandaag - datetime.timedelta(days=ndays)
    else:
        raise Exception("Cannot parse " + datestr)
    text = get_job_text(text_ref)
    return {
        "headline": headline,
        "date": date,
        "medium": "Monsterboard",
        "author": author,
        "text": text,
        "url": text_ref,
    }
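
# scrape_pages fetches one result page of a category (&pg=i) and yields an
# article dict per listing row. A "Pagina niet gevonden" (page not found)
# response ends the scrape; an empty listings table is treated as a
# transient error and retried (note that this retries indefinitely if the
# page really has no jobs).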
def scrape_pages(page_url, i):
    full_url = "{page_url}&pg={i}".format(**locals())
    print ">> Scraping page", full_url
    page = requests.get(full_url)
    tree = html.fromstring(page.text)
    if 'Pagina niet gevonden' in page.text:
        print ">>>> not found: {full_url}".format(**locals())
        return
    trs = list(tree.cssselect('table.listingsTable tr.odd, '
                              'table.listingsTable tr.even'))
    if not trs:
        # Retry with the original page_url so &pg= is not appended twice
        print "No jobs, retrying..."
        for a in scrape_pages(page_url, i):
            yield a
        return
    for tr in trs:
        yield scrape_job(tr)

def get_link(link):
    # Currently unused helper: fetch a page and print the parsed tree
    page = requests.get(link)
    tree = html.fromstring(page.text)
    print tree
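
# Main script: connect to a local AmCAT server and, for every category,
# scrape result pages 70-200, uploading each page's articles to project 1,
# article set 32. Host, credentials, project, and set are hard-coded.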
conn = AmcatAPI("http://localhost:8000", "amcat", "amcat")

url = 'http://monsterboard.nl'
pages = list(get_pages(url))
for page in pages:
    for i in range(70, 201):
        print page, i
        art = list(scrape_pages(page, i))
        # Stop paging this category once a page yields no articles
        if not art:
            break
        print "Uploading {} articles to amcat".format(len(art))
        articles = conn.create_articles(project=1, articleset=32, json_data=art)