Created
January 29, 2016 11:08
-
-
Save nruigrok/bff633ceb7022778575b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from locale import setlocale
import re
import sys
import time
from urlparse import urljoin

import requests

from amcatclient import AmcatAPI
# NOTE(review): `html` and `etree` are used below but never imported; they
# presumably come from lxml (`from lxml import etree, html`) -- confirm and
# add that third-party import.
#setlocale(locale.LC_ALL, "nl_NL.utf8") | |
MONTHS = {"okt": "oct", "maa": "mar", "mei": "may"} | |
#>>> for page in range(10, 130, 10): | |
#... print page | |
def get_fora(url): | |
page = requests.get(url) | |
if "Je hebt niet de benodigde permissies om onderwerpen in dit forum te lezen." in page.text: | |
return | |
tree = html.fromstring(page.text) | |
l = tree.cssselect('h4 a.forumtitle') | |
for ls in l: | |
link = urljoin(url, ls.get('href')) | |
yield link | |
subfora = get_fora(link) | |
for subforum in subfora: | |
yield subforum | |
def get_npages(tree): | |
pagination = tree.cssselect('div.pagination')[0] | |
ul = pagination.cssselect('ul') | |
jump = pagination.cssselect('div.page-jump') | |
if jump: | |
paginas=jump[0].tail | |
elif ul: | |
paginas=ul[0].tail | |
else: | |
paginas=pagination.text_content() | |
m = re.match(r"([0-9]+) (onderwerpe?n?|berichte?n?)$", paginas.strip()) | |
if not m: | |
raise Exception("Could not parse {paginas!r}".format(**locals())) | |
n = int(m.group(1)) | |
return n | |
def scrape_forum(forum_url): | |
print ">> Scraping forum", forum_url | |
page = requests.get(forum_url) | |
tree = html.fromstring(page.text) | |
n = get_npages(tree) | |
for page in range(0, n, 25): | |
scrape_page(forum_url, page) | |
def scrape_page(forum_url, page): | |
print ">>> Scraping forum page", page | |
url = "{forum_url}&start={page}".format(**locals()) | |
page = requests.get(url) | |
tree = html.fromstring(page.text) | |
l = tree.cssselect('h4 a.topictitle') | |
for ls in l: | |
scrape_thread(urljoin(url, ls.get('href'))) | |
def scrape_thread(url): | |
print ">>>> Scraping thread", url | |
page = requests.get(url) | |
tree = html.fromstring(page.text) | |
n = get_npages(tree) | |
articles = [] | |
for page in range(0, n, 10): | |
articles += scrape_thread_page(url, page) | |
print ">>>> Creating {nposts} posts in thread {url}".format(nposts=len(articles), **locals()) | |
if articles: | |
articles[0]['children'] = articles[1:] | |
articles = [articles[0]] | |
articles = conn.create_articles(project=project_id, articleset=articleset_id, json_data=articles) | |
def scrape_thread_page(url, page): | |
url = "{url}&start={page}".format(**locals()) | |
page = requests.get(url) | |
tree = html.fromstring(page.text) | |
crumbs = tree.cssselect("ul#breadcrumbs a") | |
section = " / ".join([crumb.text_content() for crumb in crumbs]) | |
result = [] | |
for post in tree.cssselect("div.post"): | |
a = create_art(post) | |
if a is not None: | |
a['section'] = section | |
result.append(a) | |
return result | |
def create_art(post): | |
headl = post.cssselect('article.postbody h3') | |
txt = post.cssselect('div.post-content') | |
auth = post.cssselect('a.username') | |
if not auth: | |
auth = post.cssselect('a.username-coloured') | |
d = post.cssselect('li.right.post-date') | |
if not d: | |
return | |
headline = headl[0].text_content().strip() | |
text = etree.tostring(txt[0]) | |
text = re.sub("<br/?>|<p/?>", "\n\n", text) | |
text = re.sub("<[^>]+?>", " ", text) | |
text = re.sub(" +", " ", text) | |
text = re.sub("[\n\s][\n\s]+", "\n\n", text) | |
text = text.strip() | |
if not text: | |
text = "-" | |
author = auth[0].text_content().strip() | |
date = d[0].text_content().strip() | |
date = date.lower() | |
for nl, en in MONTHS.items(): | |
date = date.replace(nl, en) | |
date2=time.strptime(date,"%d %b %Y, %H:%M") | |
date_iso = time.strftime('%Y-%m-%dT%H:%M', date2) | |
return { | |
"headline": headline, | |
"date": date_iso, | |
"medium": "Postbodeforum", | |
"author": author, | |
"text": text, | |
} | |
conn = AmcatAPI("http://localhost:8000", "amcat","amcat") | |
project_id = 1 | |
articleset_id = 21 | |
url = 'http://postbezorgers.org/forum/index.php' | |
fora = list(get_fora(url)) | |
for forum in fora: | |
scrape_forum(forum) | |
sys.exit() | |
arts = [] | |
for post in links4: | |
print post | |
art = create_art(post) | |
if art: | |
arts.append(art) | |
arts = [create_art(posts) for posts in links4] | |
articles = conn.create_articles(project=1, articleset=11, json_data=arts) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment