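# Scraper for job listings on monsterboard.nl: walks the category pages,
# scrapes each listing into an article dict, and uploads the results to a
# local AmCat server via amcatclient.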
from lxml import html, etree
import datetime
import re
import requests
import html2text
from amcatclient import AmcatAPI
def get_pages(url):
    """Yield links to job category pages, skipping some categories."""
    page = requests.get(url)
    tree = html.fromstring(page.text)
    skip = ('Accounting-Financieel', 'Administratief', 'Bouw')
    for c in tree.cssselect('div#listCategories.hideonstart a'):
        link = c.get('href')
        if any(s in link for s in skip):
            continue
        yield link
def get_npages(tree):
    """Yield links to the following result pages from the pagination bar."""
    for p in tree.cssselect('a.box.afterSelected'):
        yield p.get('href')
# Selectors for the element holding the job text; different job pages use
# different templates, so they are tried in order
JOB_TEXT_SELECTORS = ['#bodycol', '#CJT_container', '#container',
                      '#content-container', '#content-container1',
                      '.jobview-section', '#monsterAppliesContentHolder']

def get_job_text(href):
    """Fetch a job page and return the job text converted to plain text."""
    page = requests.get(href)
    tree = html.fromstring(page.text)
    for selector in JOB_TEXT_SELECTORS:
        content = tree.cssselect(selector)
        if content:
            inner = etree.tostring(content[0], encoding='unicode')
            break
    else:
        print("Cannot find job text for {href}, using all HTML".format(**locals()))
        inner = page.text
    return html2text.html2text(inner)
def scrape_job(tr):
    """Scrape one job listing (a table row) into an AmCat article dict."""
    headl = tr.cssselect('div.jobTitleContainer a')
    headline = headl[0].text_content().strip()
    text_ref = headl[0].get("href") + "#"
    print(text_ref)
    auth = tr.cssselect('div.companyContainer div a[href]')
    author = auth[0].get("title").strip()
    sect = tr.cssselect('div.jobLocationSingleLine a')
    if sect:
        section = sect[0].get("title")
    else:
        sect = tr.cssselect('div.jobLocationSingleLine')
        section = sect[0].text_content().replace("Locatie:", "").strip()
    # The posting date is either "Vandaag" (today) or "N dagen geleden"
    # (N days ago), relative to today
    datestr = tr.cssselect('div.fnt20')[0].text_content()
    datestr = datestr.replace("Geplaatst:", "").strip()
    vandaag = datetime.date.today()
    if datestr == "Vandaag":
        date = vandaag
    elif "dagen geleden" in datestr:
        m = re.match(r"(\d+) dagen geleden", datestr)
        ndays = int(m.group(1))
        date = vandaag - datetime.timedelta(days=ndays)
    else:
        raise Exception("Cannot parse " + datestr)
    text = get_job_text(text_ref)
    return {
        "headline": headline,
        "date": date.isoformat(),
        "medium": "Monsterboard",
        "author": author,
        "section": section,
        "text": text,
        "url": text_ref,
    }
def scrape_pages(page_url, i, retries=3):
    """Yield all jobs on result page i of the given category page."""
    url = "{page_url}&pg={i}".format(**locals())
    print(">> Scraping page " + url)
    page = requests.get(url)
    tree = html.fromstring(page.text)
    if 'Pagina niet gevonden' in page.text:  # "page not found"
        print(">>>> niet gevonden {url}".format(**locals()))
        return
    trs = list(tree.cssselect('table.listingsTable tr.odd, '
                              'table.listingsTable tr.even'))
    if not trs:
        # The listing sometimes fails to load; retry a few times with the
        # original URL (appending &pg= again would duplicate the parameter)
        if retries:
            print("No jobs, retrying...")
            for a in scrape_pages(page_url, i, retries - 1):
                yield a
        return
    for tr in trs:
        yield scrape_job(tr)
# get_link(link)
def get_link(link):
    """Debug helper: fetch a link and print the parsed tree."""
    page = requests.get(link)
    tree = html.fromstring(page.text)
    print(tree)
conn = AmcatAPI("http://localhost:8000", "amcat", "amcat")
url = 'http://monsterboard.nl'
pages = list(get_pages(url))
for page in pages:
    for i in range(70, 201):
        print("{page} {i}".format(**locals()))
        art = list(scrape_pages(page, i))
        if not art:
            # No more results for this category, move on to the next one
            break
        print("Uploading {} articles to amcat".format(len(art)))
        articles = conn.create_articles(project=1, articleset=32, json_data=art)
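# Note: besides the standard library this script needs lxml (plus the
# cssselect package for .cssselect()), requests, html2text, and amcatclient.
# The AmCat server address, username, and password above are assumed to be
# the defaults of a local test installation; adjust them, and the project
# and articleset ids, for your own server.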