Skip to content

Instantly share code, notes, and snippets.

@sente
Created February 21, 2018 19:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sente/9b5eeee20faa33924022f257892a4738 to your computer and use it in GitHub Desktop.
import requests
import re
import glob
import os
import sys
import pprint
import pandas
import lxml
import lxml.html
def get_link(symbol):
    """Return the href of the first 'Type=DOC' download link for a UN
    document *symbol* on the un.org document-search page.

    Returns None when the HTTP request does not succeed (non-200) or no
    matching link is present on the page.
    """
    url = ("http://www.un.org/en/ga/search/doc_top.asp"
           "?symbol={}&Lang=E&referer=/english/".format(symbol))
    res = requests.get(url)
    if res.status_code != 200:
        return None
    root = lxml.html.fromstring(res.text)
    # The page lists several anchors; the downloadable Word document is
    # the one whose href ends with 'Type=DOC'.
    for anchor in root.xpath('//a'):
        href = anchor.get('href', '')
        if href.endswith('Type=DOC'):
            return href
    return None
def getpage1(page):
    """Parse a UN SC listing page into a list of row dicts.

    *page* is either an http(s) URL (fetched by lxml directly) or a path
    to a local HTML file.  Each ``<tr>`` of every table becomes
    ``{'vals': [cell texts, whitespace-collapsed], 'links': [hrefs]}``.
    Links are made absolute against http://www.un.org/en/sc/.

    Raises FileNotFoundError for a local path that does not exist
    (the original fell through with an UnboundLocalError).
    """
    if page.startswith(('http://', 'https://')):
        doc = lxml.html.parse(page)
    else:
        if not os.path.isfile(page):
            raise FileNotFoundError(page)
        with open(page, 'r') as fh:
            doc = lxml.html.fromstring(fh.read())
    doc.make_links_absolute(base_url='http://www.un.org/en/sc/')
    rows = []
    for tr in doc.xpath('//table/tr'):
        # Collapse internal whitespace runs so multi-line cells compare cleanly.
        vals = [re.sub(r'\s+', ' ', td.text_content().strip())
                for td in tr.xpath('.//td')]
        links = [a.get('href', '') for a in tr.xpath('.//*/a')]
        rows.append({'links': links, 'vals': vals})
    return rows
def getpage(page):
    """Parse a UN SC listing page (URL or local file) into row dicts.

    For each table row, anchor cells contribute ``linktext``/``linkhref``
    keys and plain cells contribute ``tdtext``.  Later cells in the same
    row overwrite earlier ones (preserved from the original logic).

    Emits per-cell debug output on stdout, as the original did.

    Raises FileNotFoundError for a local path that does not exist
    (the original fell through with an UnboundLocalError).
    """
    if page.startswith(('http://', 'https://')):
        doc = lxml.html.parse(page)
    else:
        if not os.path.isfile(page):
            raise FileNotFoundError(page)
        with open(page, 'r') as fh:
            doc = lxml.html.fromstring(fh.read())
    # Group <td> cells under their parent <tr>; plain dict keeps
    # insertion order, so rows come out in document order.
    trs = {}
    for td in doc.xpath('//table/tr/td'):
        trs.setdefault(td.getparent(), []).append(td)
    res = []
    for tr, tds in trs.items():
        rdic = {}
        for i, td in enumerate(tds):
            print(i, td.text_content())
            anchors = td.xpath(".//a")
            if anchors:
                # NOTE(review): only the LAST anchor in a cell survives;
                # earlier ones are overwritten — confirm this is intended.
                for anchor in anchors:
                    rdic['linktext'] = anchor.text_content()
                    rdic['linkhref'] = anchor.get("href", '')
            else:
                rdic['tdtext'] = td.text_content()
        print("\n\n\n")
        res.append(rdic)
    return res
def main():
    """Parse every page named on the command line; map page -> parsed rows."""
    return {page: getpage(page) for page in sys.argv[1:]}
if __name__ == '__main__':
    # women.txt lists the listing pages to scan, one relative path per line.
    with open('women.txt', 'r') as fh:
        files = [line.strip() for line in fh if line.strip()]

    ar = []
    for f in files:
        print(f)
        for res in getpage1(f):
            for v in res['vals']:
                if 'Women' not in v:
                    continue
                if res['links']:
                    # The first link carries the document symbol as a
                    # query parameter, e.g. ...?symbol=S/RES/1325(2000).
                    symbol = res['links'][0].split('symbol=')[1]
                    doc_page = get_link(symbol)
                    print(doc_page)
                    if doc_page is not None:  # guard: no DOC link found
                        r = requests.get(doc_page)
                        # r.url is the final URL after any redirects;
                        # the original r.history[-1].url raised
                        # IndexError when there was no redirect.
                        fname = r.url.split('/')[-1]
                        os.makedirs('docs', exist_ok=True)
                        with open(os.path.join('docs', fname), 'wb') as out:
                            out.write(r.content)
                        res['file'] = fname
                res['ref'] = 'http://www.un.org/en/sc/documents/' + f
                ar.append(res)

    import tablib  # third-party; imported lazily, as in the original
    ds = tablib.Dataset()
    ds.dict = ar
    with open('ds.csv', 'w') as out:
        out.write(ds.csv)
resolutions/2000.shtml
resolutions/2008.shtml
resolutions/2009.shtml
resolutions/2010.shtml
resolutions/2013.shtml
resolutions/2015.shtml
statements/2001.shtml
statements/2002.shtml
statements/2004.shtml
statements/2005.shtml
statements/2006.shtml
statements/2007.shtml
statements/2008.shtml
statements/2010.shtml
statements/2011.shtml
statements/2012.shtml
statements/2014.shtml
statements/2016.shtml
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment