@nruigrok
Created January 29, 2016 11:08
import re
import time

import requests
from lxml import html, etree
from urlparse import urljoin  # Python 2; use urllib.parse on Python 3

from amcatclient import AmcatAPI

# Locale-based date parsing is disabled; instead, the Dutch month
# abbreviations that differ from English are mapped by hand:
# import locale; locale.setlocale(locale.LC_ALL, "nl_NL.utf8")
MONTHS = {"okt": "oct", "maa": "mar", "mei": "may"}
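# Illustrative sketch (not part of the scraper): the mapping turns a Dutch
# post date into something time.strptime can parse under the C locale:
#   >>> d = "29 mei 2016, 11:08"
#   >>> for nl, en in MONTHS.items(): d = d.replace(nl, en)
#   >>> d
#   '29 may 2016, 11:08'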
def get_fora(url):
    """Recursively yield links to all (sub)fora reachable from url."""
    page = requests.get(url)
    if "Je hebt niet de benodigde permissies om onderwerpen in dit forum te lezen." in page.text:
        return  # forum is not readable without logging in, skip it
    tree = html.fromstring(page.text)
    for a in tree.cssselect('h4 a.forumtitle'):
        link = urljoin(url, a.get('href'))
        yield link
        # recurse into the subfora of this forum
        for subforum in get_fora(link):
            yield subforum
def get_npages(tree):
    """Return the total number of topics/posts listed in the pagination block."""
    pagination = tree.cssselect('div.pagination')[0]
    ul = pagination.cssselect('ul')
    jump = pagination.cssselect('div.page-jump')
    # The count is the text trailing the page-jump box or the page list,
    # whichever is present; otherwise the pagination div only holds the count
    if jump:
        paginas = jump[0].tail
    elif ul:
        paginas = ul[0].tail
    else:
        paginas = pagination.text_content()
    m = re.match(r"([0-9]+) (onderwerpe?n?|berichte?n?)$", paginas.strip())
    if not m:
        raise Exception("Could not parse {paginas!r}".format(**locals()))
    return int(m.group(1))
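# Illustrative example: the pagination text ends in "N onderwerpen" (topics)
# on forum pages and "N berichten" (posts) on thread pages, so e.g.:
#   >>> re.match(r"([0-9]+) (onderwerpe?n?|berichte?n?)$", "123 berichten").group(1)
#   '123'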
def scrape_forum(forum_url):
    print ">> Scraping forum", forum_url
    page = requests.get(forum_url)
    tree = html.fromstring(page.text)
    n = get_npages(tree)
    # topics are listed 25 per page, paged via the &start= parameter
    for offset in range(0, n, 25):
        scrape_page(forum_url, offset)
def scrape_page(forum_url, offset):
    print ">>> Scraping forum page", offset
    url = "{forum_url}&start={offset}".format(**locals())
    page = requests.get(url)
    tree = html.fromstring(page.text)
    for a in tree.cssselect('h4 a.topictitle'):
        scrape_thread(urljoin(url, a.get('href')))
def scrape_thread(url):
    print ">>>> Scraping thread", url
    page = requests.get(url)
    tree = html.fromstring(page.text)
    n = get_npages(tree)
    articles = []
    # posts are listed 10 per page
    for offset in range(0, n, 10):
        articles += scrape_thread_page(url, offset)
    print ">>>> Creating {nposts} posts in thread {url}".format(nposts=len(articles), **locals())
    if articles:
        # upload the opening post as parent, the remaining posts as its children
        articles[0]['children'] = articles[1:]
        articles = [articles[0]]
        conn.create_articles(project=project_id, articleset=articleset_id, json_data=articles)
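# For illustration (field values abridged), a thread with three posts is
# uploaded as one parent article with two children:
#   [{"headline": "...", "date": "...", "text": "...",
#     "children": [{"headline": "...", ...}, {"headline": "...", ...}]}]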
def scrape_thread_page(url, offset):
    url = "{url}&start={offset}".format(**locals())
    page = requests.get(url)
    tree = html.fromstring(page.text)
    # the breadcrumb trail gives the forum section this thread belongs to
    crumbs = tree.cssselect("ul#breadcrumbs a")
    section = " / ".join(crumb.text_content() for crumb in crumbs)
    result = []
    for post in tree.cssselect("div.post"):
        a = create_art(post)
        if a is not None:
            a['section'] = section
            result.append(a)
    return result
def create_art(post):
    """Convert a div.post element into an AmCat article dict, or None if it has no date."""
    headl = post.cssselect('article.postbody h3')
    txt = post.cssselect('div.post-content')
    auth = post.cssselect('a.username')
    if not auth:
        auth = post.cssselect('a.username-coloured')
    d = post.cssselect('li.right.post-date')
    if not d:
        return None
    headline = headl[0].text_content().strip()
    # serialize the post body and strip the markup, keeping paragraph breaks
    text = etree.tostring(txt[0])
    text = re.sub(r"<br/?>|<p/?>", "\n\n", text)
    text = re.sub(r"<[^>]+?>", " ", text)
    text = re.sub(r" +", " ", text)
    text = re.sub(r"[\n\s][\n\s]+", "\n\n", text)
    text = text.strip()
    if not text:
        text = "-"  # AmCat does not accept empty article text
    author = auth[0].text_content().strip()
    # translate Dutch month abbreviations so strptime can parse the date
    date = d[0].text_content().strip().lower()
    for nl, en in MONTHS.items():
        date = date.replace(nl, en)
    date2 = time.strptime(date, "%d %b %Y, %H:%M")
    date_iso = time.strftime('%Y-%m-%dT%H:%M', date2)
    return {
        "headline": headline,
        "date": date_iso,
        "medium": "Postbodeforum",
        "author": author,
        "text": text,
    }
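# Illustrative return value of create_art (values are made up):
#   {"headline": "Re: Bezorging", "date": "2016-01-29T11:08",
#    "medium": "Postbodeforum", "author": "jan", "text": "..."}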
conn = AmcatAPI("http://localhost:8000", "amcat", "amcat")
project_id = 1
articleset_id = 21

url = 'http://postbezorgers.org/forum/index.php'
for forum in get_fora(url):
    scrape_forum(forum)