Created
January 29, 2016 11:08
-
-
Save nruigrok/bff633ceb7022778575b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from locale import setlocale
import re
import sys
import time
from urlparse import urljoin

import requests

from amcatclient import AmcatAPI
# NOTE(review): `html` and `etree` are used below but never imported; they
# presumably come from lxml (`from lxml import etree, html`) -- confirm and
# add that third-party import.
#setlocale(locale.LC_ALL, "nl_NL.utf8") | |
MONTHS = {"okt": "oct", "maa": "mar", "mei": "may"} | |
#>>> for page in range(10, 130, 10): | |
#... print page | |
def get_fora(url): | |
page = requests.get(url) | |
if "Je hebt niet de benodigde permissies om onderwerpen in dit forum te lezen." in page.text: | |
return | |
tree = html.fromstring(page.text) | |
l = tree.cssselect('h4 a.forumtitle') | |
for ls in l: | |
link = urljoin(url, ls.get('href')) | |
yield link | |
subfora = get_fora(link) | |
for subforum in subfora: | |
yield subforum | |
def get_npages(tree): | |
pagination = tree.cssselect('div.pagination')[0] | |
ul = pagination.cssselect('ul') | |
jump = pagination.cssselect('div.page-jump') | |
if jump: | |
paginas=jump[0].tail | |
elif ul: | |
paginas=ul[0].tail | |
else: | |
paginas=pagination.text_content() | |
m = re.match(r"([0-9]+) (onderwerpe?n?|berichte?n?)$", paginas.strip()) | |
if not m: | |
raise Exception("Could not parse {paginas!r}".format(**locals())) | |
n = int(m.group(1)) | |
return n | |
def scrape_forum(forum_url): | |
print ">> Scraping forum", forum_url | |
page = requests.get(forum_url) | |
tree = html.fromstring(page.text) | |
n = get_npages(tree) | |
for page in range(0, n, 25): | |
scrape_page(forum_url, page) | |
def scrape_page(forum_url, page): | |
print ">>> Scraping forum page", page | |
url = "{forum_url}&start={page}".format(**locals()) | |
page = requests.get(url) | |
tree = html.fromstring(page.text) | |
l = tree.cssselect('h4 a.topictitle') | |
for ls in l: | |
scrape_thread(urljoin(url, ls.get('href'))) | |
def scrape_thread(url): | |
print ">>>> Scraping thread", url | |
page = requests.get(url) | |
tree = html.fromstring(page.text) | |
n = get_npages(tree) | |
articles = [] | |
for page in range(0, n, 10): | |
articles += scrape_thread_page(url, page) | |
print ">>>> Creating {nposts} posts in thread {url}".format(nposts=len(articles), **locals()) | |
if articles: | |
articles[0]['children'] = articles[1:] | |
articles = [articles[0]] | |
articles = conn.create_articles(project=project_id, articleset=articleset_id, json_data=articles) | |
def scrape_thread_page(url, page): | |
url = "{url}&start={page}".format(**locals()) | |
page = requests.get(url) | |
tree = html.fromstring(page.text) | |
crumbs = tree.cssselect("ul#breadcrumbs a") | |
section = " / ".join([crumb.text_content() for crumb in crumbs]) | |
result = [] | |
for post in tree.cssselect("div.post"): | |
a = create_art(post) | |
if a is not None: | |
a['section'] = section | |
result.append(a) | |
return result | |
def create_art(post): | |
headl = post.cssselect('article.postbody h3') | |
txt = post.cssselect('div.post-content') | |
auth = post.cssselect('a.username') | |
if not auth: | |
auth = post.cssselect('a.username-coloured') | |
d = post.cssselect('li.right.post-date') | |
if not d: | |
return | |
headline = headl[0].text_content().strip() | |
text = etree.tostring(txt[0]) | |
text = re.sub("<br/?>|<p/?>", "\n\n", text) | |
text = re.sub("<[^>]+?>", " ", text) | |
text = re.sub(" +", " ", text) | |
text = re.sub("[\n\s][\n\s]+", "\n\n", text) | |
text = text.strip() | |
if not text: | |
text = "-" | |
author = auth[0].text_content().strip() | |
date = d[0].text_content().strip() | |
date = date.lower() | |
for nl, en in MONTHS.items(): | |
date = date.replace(nl, en) | |
date2=time.strptime(date,"%d %b %Y, %H:%M") | |
date_iso = time.strftime('%Y-%m-%dT%H:%M', date2) | |
return { | |
"headline": headline, | |
"date": date_iso, | |
"medium": "Postbodeforum", | |
"author": author, | |
"text": text, | |
} | |
conn = AmcatAPI("http://localhost:8000", "amcat","amcat") | |
project_id = 1 | |
articleset_id = 21 | |
url = 'http://postbezorgers.org/forum/index.php' | |
fora = list(get_fora(url)) | |
for forum in fora: | |
scrape_forum(forum) | |
sys.exit() | |
arts = [] | |
for post in links4: | |
print post | |
art = create_art(post) | |
if art: | |
arts.append(art) | |
arts = [create_art(posts) for posts in links4] | |
articles = conn.create_articles(project=1, articleset=11, json_data=arts) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment