nruigrok/jancis

## jancis
from lxml import html
import requests
import csv
import sys
import json
from amcatclient import AmcatAPI
import urllib
from lxml.html import fromstring
import time
from urllib.request import Request, urlopen


#headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}


def gettree(s, url, retry_delays=None):
    """Fetch *url* with session *s* and return the page as an lxml tree.

    Throttling responses are retried without limit: after a status code that
    appears in *retry_delays* the function sleeps for the configured number
    of seconds and repeats the request with the same browser-like
    ``User-Agent`` header as the first attempt.  The HTML of the page that
    was finally fetched is also written to ``/tmp/test.html`` as a debugging
    aid.

    Args:
        s: Object with a ``requests.Session``-style ``get(url, headers=...)``
            method.
        url: Address to fetch; also used as the base for absolute links.
        retry_delays: Optional mapping of HTTP status code to the number of
            seconds to wait before retrying.  Defaults to
            ``{429: 60, 403: 10}``.

    Returns:
        The parsed document with every link made absolute against *url*.

    Raises:
        Exception: if the final response status is neither 200 nor one of
            the codes listed in *retry_delays*.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    if retry_delays is None:
        retry_delays = {429: 60, 403: 10}

    r2 = s.get(url, headers=headers)
    # A single loop handles every throttling code, so a 429 that follows a
    # 403 (or the reverse) is waited out instead of being raised as an error.
    while r2.status_code in retry_delays:
        delay = retry_delays[r2.status_code]
        print("{} reached, waiting {} seconds".format(r2.status_code, delay))
        time.sleep(delay)
        # Retries must carry the User-Agent as well, otherwise they are not
        # the same request as the first attempt.
        r2 = s.get(url, headers=headers)
    if r2.status_code != 200:
        raise Exception("status code {},url:{}".format(r2.status_code, url))

    # Debug dump of the most recent page. The context manager closes the
    # handle, and UTF-8 avoids encode errors from the locale default.
    with open("/tmp/test.html", "w", encoding="utf-8") as dump:
        dump.write(r2.text)
    tree = html.fromstring(r2.text)
    tree.make_links_absolute(url)
    return tree

def wijnartikel(url):
    """Scrape one tasting-note page and build the article dict for upload.

    The page is fetched with the module-level session ``s`` via ``gettree``.
    The note text is taken from ``div.main-body-text`` (``p`` elements,
    falling back to ``span`` elements) and the metadata from the key/value
    rows of ``table.learn-table``.

    Args:
        url: Address of the tasting-note page.

    Returns:
        dict with the keys ``title``, ``text``, ``date``, ``author`` and
        ``url``, plus ``Country``, ``Alcohol_num``, ``Score_num`` and
        ``Whentodrink`` when the page supplies them.  ``Alcohol_num`` and
        ``Score_num`` hold the page value multiplied by ten and truncated
        to an int.

    Raises:
        KeyError: if the table has no ``Vintage`` or ``Reviewer`` row.
        ValueError: if the alcohol or score value is still not numeric
            after clean-up.
        Exception: propagated from ``gettree`` when the page cannot be
            fetched.
    """
    print(url)
    regtree = gettree(s, url)

    wijntxt = regtree.cssselect('div.main-body-text p')
    if not wijntxt:
        wijntxt = regtree.cssselect('div.main-body-text span')
    # Join every non-empty fragment so multi-paragraph notes are kept whole
    # rather than reduced to their last paragraph.
    fragments = [wtxt.text_content().strip() for wtxt in wijntxt]
    inhoud = "\n\n".join(frag for frag in fragments if frag) or "(no text)"

    wdict = {}
    for item in regtree.cssselect('table.learn-table tr'):
        tds = item.cssselect("td")
        if len(tds) < 2:
            # Rows without a key cell and a value cell carry no metadata.
            continue
        wdict[tds[0].text_content().strip()] = tds[1].text_content().strip()

    producer = wdict.get("Producer")
    appellation = wdict.get("Appellation")
    # Only the vintage year is known, so the date is pinned to 1 January of
    # that year; "NV" and "??" are mapped to the placeholder year 1900.
    datum = "%s-01-01T00:00" % wdict['Vintage'].replace("NV", "1900").replace("??", "1900").replace("70/71", "70")
    art = {"title": "%s : %s" % (appellation, producer),
           "text": inhoud,
           "date": datum,
           "author": wdict['Reviewer'],
           "url": url}
    if "Country" in wdict:
        art["Country"] = wdict["Country"]

    # Numeric fields are stored as integer tenths (13.5 -> 135). A value that
    # is empty once the decoration is stripped is treated as missing.
    alcohol = wdict.get('Alcohol', '').replace("%", "").strip()
    if alcohol:
        art["Alcohol_num"] = int(float(alcohol) * 10)
    if 'Score' in wdict and wdict['Score'] != '?':
        score = (wdict['Score']
                 .replace("??", "0").replace("?", "").replace(",", ".")
                 .replace("_", "").replace("!", "").replace(".%", "")
                 .replace("+", "").replace("-", ""))
        if score:
            art["Score_num"] = int(float(score) * 10)

    if "When to drink" in wdict:
        art["Whentodrink"] = wdict['When to drink']
    return art


# Connection to the AmCAT server that receives the scraped articles.
# SECURITY NOTE(review): the AmCAT and site credentials below are hard-coded
# in the source; consider loading them from the environment or a config file
# that is kept out of version control.
conn = AmcatAPI("https://amcat.nl", "nel","S!nterklaas")
#conn = AmcatAPI("http://localhost:8000", "amcat","amcat")

# Shared HTTP session; wijnartikel() reads this module-level name `s`.
s = requests.Session()
# Initial GET followed by a POST of the account details -- presumably this
# logs the session in so later requests carry the login cookies (TODO confirm
# that the site accepts the login form at this URL).
respons=s.get("http://www.jancisrobinson.com")
respons=s.post("http://www.jancisrobinson.com", data={"username": "nelruigrok@nieuwsmonitor.org", "password": "sinterklaas"})

# Search-result pages to walk: 1 up to and including 499.
p=range(1,500)
# Leftover debugging aid: scrape one known tasting page and stop.
#http://www.jancisrobinson.com/tastings/view/568013?from_search%5Bperpage%5D=100&from_search%5Bpage%5D=51&search_position=5062
#url="http://www.jancisrobinson.com/tastings/view/568006?from_search%5Bperpage%5D=100&from_search%5Bpage%5D=51&search_position=5063"
#wijnartikel(url)
#sys.exit()

# For every result page (100 results per page), scrape each listed tasting
# note and upload it to AmCAT one article at a time.
for pag in p:
    tree = gettree(s,"http://www.jancisrobinson.com/tastings/search?perpage=100&page=%s" %(pag))
    print(tree)
    # NOTE(review): this selects the <h2> elements themselves and then reads
    # their "href" attribute below; if the link sits on an <a> inside the
    # heading, art.get("href") returns None -- verify against the page markup.
    a = tree.cssselect("h2")
    print(a)
    # Collects this page's articles; nothing in this section reads it back.
    w =[]
    for art in a:
        wijn = wijnartikel(art.get("href"))
        print(wijn)
        w.append(wijn)
        print(wijn)
        # Upload immediately so an error on a later page does not lose the
        # articles scraped so far.
        articles = conn.create_articles(project=1039, articleset=73420, json_data=[wijn])
	from lxml import html
	import requests
	import csv
	import sys
	import json
	from amcatclient import AmcatAPI
	import urllib
	from lxml.html import fromstring
	import time
	from urllib.request import Request, urlopen


	#headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}


	def gettree(s, url, retry_delays=None):
	    """Fetch *url* with session *s* and return the page as an lxml tree.

	    Throttling responses are retried without limit: after a status code
	    that appears in *retry_delays* the function sleeps for the configured
	    number of seconds and repeats the request with the same browser-like
	    ``User-Agent`` header as the first attempt.  The HTML of the page
	    that was finally fetched is also written to ``/tmp/test.html`` as a
	    debugging aid.

	    Args:
	        s: Object with a ``requests.Session``-style
	            ``get(url, headers=...)`` method.
	        url: Address to fetch; also used as the base for absolute links.
	        retry_delays: Optional mapping of HTTP status code to the number
	            of seconds to wait before retrying.  Defaults to
	            ``{429: 60, 403: 10}``.

	    Returns:
	        The parsed document with every link made absolute against *url*.

	    Raises:
	        Exception: if the final response status is neither 200 nor one
	            of the codes listed in *retry_delays*.
	    """
	    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
	    if retry_delays is None:
	        retry_delays = {429: 60, 403: 10}

	    r2 = s.get(url, headers=headers)
	    # A single loop handles every throttling code, so a 429 that follows
	    # a 403 (or the reverse) is waited out instead of raised as an error.
	    while r2.status_code in retry_delays:
	        delay = retry_delays[r2.status_code]
	        print("{} reached, waiting {} seconds".format(r2.status_code, delay))
	        time.sleep(delay)
	        # Retries must carry the User-Agent as well, otherwise they are
	        # not the same request as the first attempt.
	        r2 = s.get(url, headers=headers)
	    if r2.status_code != 200:
	        raise Exception("status code {},url:{}".format(r2.status_code, url))

	    # Debug dump of the most recent page. The context manager closes the
	    # handle, and UTF-8 avoids encode errors from the locale default.
	    with open("/tmp/test.html", "w", encoding="utf-8") as dump:
	        dump.write(r2.text)
	    tree = html.fromstring(r2.text)
	    tree.make_links_absolute(url)
	    return tree

	def wijnartikel(url):
	    """Scrape one tasting-note page and build the article dict for upload.

	    The page is fetched with the module-level session ``s`` via
	    ``gettree``.  The note text is taken from ``div.main-body-text``
	    (``p`` elements, falling back to ``span`` elements) and the metadata
	    from the key/value rows of ``table.learn-table``.

	    Args:
	        url: Address of the tasting-note page.

	    Returns:
	        dict with the keys ``title``, ``text``, ``date``, ``author`` and
	        ``url``, plus ``Country``, ``Alcohol_num``, ``Score_num`` and
	        ``Whentodrink`` when the page supplies them.  ``Alcohol_num`` and
	        ``Score_num`` hold the page value multiplied by ten and truncated
	        to an int.

	    Raises:
	        KeyError: if the table has no ``Vintage`` or ``Reviewer`` row.
	        ValueError: if the alcohol or score value is still not numeric
	            after clean-up.
	        Exception: propagated from ``gettree`` when the page cannot be
	            fetched.
	    """
	    print(url)
	    regtree = gettree(s, url)

	    wijntxt = regtree.cssselect('div.main-body-text p')
	    if not wijntxt:
	        wijntxt = regtree.cssselect('div.main-body-text span')
	    # Join every non-empty fragment so multi-paragraph notes are kept
	    # whole rather than reduced to their last paragraph.
	    fragments = [wtxt.text_content().strip() for wtxt in wijntxt]
	    inhoud = "\n\n".join(frag for frag in fragments if frag) or "(no text)"

	    wdict = {}
	    for item in regtree.cssselect('table.learn-table tr'):
	        tds = item.cssselect("td")
	        if len(tds) < 2:
	            # Rows without a key cell and a value cell carry no metadata.
	            continue
	        wdict[tds[0].text_content().strip()] = tds[1].text_content().strip()

	    producer = wdict.get("Producer")
	    appellation = wdict.get("Appellation")
	    # Only the vintage year is known, so the date is pinned to 1 January
	    # of that year; "NV" and "??" are mapped to the placeholder year 1900.
	    datum = "%s-01-01T00:00" % wdict['Vintage'].replace("NV", "1900").replace("??", "1900").replace("70/71", "70")
	    art = {"title": "%s : %s" % (appellation, producer),
	           "text": inhoud,
	           "date": datum,
	           "author": wdict['Reviewer'],
	           "url": url}
	    if "Country" in wdict:
	        art["Country"] = wdict["Country"]

	    # Numeric fields are stored as integer tenths (13.5 -> 135). A value
	    # that is empty once the decoration is stripped is treated as missing.
	    alcohol = wdict.get('Alcohol', '').replace("%", "").strip()
	    if alcohol:
	        art["Alcohol_num"] = int(float(alcohol) * 10)
	    if 'Score' in wdict and wdict['Score'] != '?':
	        score = (wdict['Score']
	                 .replace("??", "0").replace("?", "").replace(",", ".")
	                 .replace("_", "").replace("!", "").replace(".%", "")
	                 .replace("+", "").replace("-", ""))
	        if score:
	            art["Score_num"] = int(float(score) * 10)

	    if "When to drink" in wdict:
	        art["Whentodrink"] = wdict['When to drink']
	    return art


	# Connection to the AmCAT server that receives the scraped articles.
	# SECURITY NOTE(review): the AmCAT and site credentials below are
	# hard-coded in the source; consider loading them from the environment or
	# a config file that is kept out of version control.
	conn = AmcatAPI("https://amcat.nl", "nel","S!nterklaas")
	#conn = AmcatAPI("http://localhost:8000", "amcat","amcat")

	# Shared HTTP session; wijnartikel() reads this module-level name `s`.
	s = requests.Session()
	# Initial GET followed by a POST of the account details -- presumably
	# this logs the session in so later requests carry the login cookies
	# (TODO confirm that the site accepts the login form at this URL).
	respons=s.get("http://www.jancisrobinson.com")
	respons=s.post("http://www.jancisrobinson.com", data={"username": "nelruigrok@nieuwsmonitor.org", "password": "sinterklaas"})

	# Search-result pages to walk: 1 up to and including 499.
	p=range(1,500)
	# Leftover debugging aid: scrape one known tasting page and stop.
	#http://www.jancisrobinson.com/tastings/view/568013?from_search%5Bperpage%5D=100&from_search%5Bpage%5D=51&search_position=5062
	#url="http://www.jancisrobinson.com/tastings/view/568006?from_search%5Bperpage%5D=100&from_search%5Bpage%5D=51&search_position=5063"
	#wijnartikel(url)
	#sys.exit()

	# For every result page (100 results per page), scrape each listed
	# tasting note and upload it to AmCAT one article at a time.
	for pag in p:
	    tree = gettree(s,"http://www.jancisrobinson.com/tastings/search?perpage=100&page=%s" %(pag))
	    print(tree)
	    # NOTE(review): this selects the <h2> elements themselves and then
	    # reads their "href" attribute below; if the link sits on an <a>
	    # inside the heading, art.get("href") returns None -- verify against
	    # the page markup.
	    a = tree.cssselect("h2")
	    print(a)
	    # Collects this page's articles; nothing in this section reads it back.
	    w =[]
	    for art in a:
	        wijn = wijnartikel(art.get("href"))
	        print(wijn)
	        w.append(wijn)
	        print(wijn)
	        # Upload immediately so an error on a later page does not lose
	        # the articles scraped so far.
	        articles = conn.create_articles(project=1039, articleset=73420, json_data=[wijn])