Skip to content

Instantly share code, notes, and snippets.

@vadimii
Created May 7, 2012 05:54
Show Gist options
  • Save vadimii/2626164 to your computer and use it in GitHub Desktop.
Save vadimii/2626164 to your computer and use it in GitHub Desktop.
Extract domain structure of lomonosov-fund.ru
# coding=utf-8
import urllib2
import string
import os.path
from lxml.html import fromstring
from lxml import etree
from datetime import datetime
# Root URL of the encyclopedia. load_category() addresses a category page as
# base_url + ':' + <category id>.
base_url = 'http://www.lomonosov-fund.ru/enc/ru/encyclopedia'
# Path prefix for cached pages. It is concatenated directly with file names,
# so the trailing slash is required.
local_cache = 'cache/'
def load_main():
    """Download the encyclopedia front page into the cache as main.html."""
    target = ''.join((local_cache, 'main.html'))
    load_to_cache(base_url, target)
def load_category(cid):
    """Download the page of category `cid` into the cache as <cid>.html.

    The category URL is the base URL followed by ':' and the category id.
    """
    load_to_cache(
        ':'.join((base_url, cid)),
        ''.join((local_cache, cid, '.html')),
    )
def load_to_cache(url, localfn):
    """Fetch `url` and store it in `localfn`, re-encoded as UTF-8.

    The response body is decoded as windows-1251 and written back out as
    UTF-8.

    Args:
        url: Address of the page to download.
        localfn: Path of the cache file to (over)write.

    Raises:
        urllib2.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the body is not valid windows-1251.
    """
    response = urllib2.urlopen(url)
    try:
        raw = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    content = raw.decode('windows-1251')

    # Create the cache directory on first use so the write below cannot
    # fail just because the directory is missing.
    cache_dir = os.path.dirname(localfn)
    if cache_dir and not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)

    with open(localfn, 'w') as store:
        store.write(content.encode('utf-8'))
def parse_main():
    """Parse the cached front page into the top three category levels.

    Downloads the front page first if it is not in the cache yet.

    Returns:
        A dict keyed by root category tuples ``(id, title, description)``.
        Each value is a dict keyed by level-2 category tuples of the same
        shape, whose values are lists of level-3 category tuples (with an
        empty description).

    Raises:
        IndexError: If the page has no ``#catalog`` element or fewer than
            three ``.level-1-panel`` elements.
    """
    localfn = local_cache + 'main.html'
    if not os.path.exists(localfn):
        # load_main() takes no arguments.
        load_main()
    with open(localfn, 'r') as store:
        content = store.read().decode('utf-8')
    doc = fromstring(content)
    catalog = doc.cssselect('#catalog')[0]
    panel_elems = catalog.cssselect('.level-1-panel')
    # The root categories are hard-coded and matched to the panels by
    # their order on the page.
    panels = {
        ('017', u'Наука', ''): panel_elems[0],
        ('018', u'Искусство, культура и религия', ''): panel_elems[1],
        ('019', u'Современная Россия', ''): panel_elems[2],
    }

    def parse_panel(panel):
        """Return {level-2 tuple: [level-3 tuples]} for one root panel."""
        l2cats = {}
        for l2e in panel.cssselect('p > big > a'):
            # The anchor sits inside <p><big>...; climb two levels to the
            # <p>, whose text is used as the description.
            parent = l2e.getparent().getparent()
            # The category id is whatever follows the last ':' in the href.
            l2id = l2e.get('href').split(':')[-1]
            l2title = l2e.text.strip()
            l2desc = parent.text_content().strip()
            # Level-3 links are looked up in the element following the <p>;
            # a paragraph with no following sibling has no children.
            sibling = parent.getnext()
            if sibling is None:
                level3s = []
            else:
                level3s = sibling.cssselect('ul > li > a')
            l2cats[(l2id, l2title, l2desc)] = [
                (l3e.get('href').split(':')[-1],
                 l3e.text_content().strip(),
                 '')
                for l3e in level3s
            ]
        return l2cats

    rootcats = {}
    for key, panel in panels.items():
        rootcats[key] = parse_panel(panel)
    return rootcats
def parse_content(cid):
    """Parse the cached page of category `cid`.

    Downloads the page first if it is not in the cache yet. The page's
    ``.category-binds`` container holds ``article > h3`` headings; the
    element following each heading is the list for that heading. The
    heading text u'↓' marks sub-categories and u'←' marks related
    categories.

    Args:
        cid: Category id, as it appears after the last ':' in a link.

    Returns:
        A pair ``(subcats, relcats)`` of generators, each yielding
        ``(id, title, description)`` tuples.

    Raises:
        IndexError: If the page has no ``.category-binds`` element.
        KeyError: If the u'↓' or u'←' heading is missing.
    """
    def extract_categories(ul):
        """Yield (id, title, description) for every 'li > a' under `ul`."""
        for a in ul.cssselect('li > a'):
            link_id = a.get('href').split(':')[-1]
            title = a.text.strip()
            desc = a.get('title')
            desc = desc.strip() if desc else ''
            yield (link_id, title, desc)

    localfn = local_cache + cid + '.html'
    if not os.path.exists(localfn):
        load_category(cid)
    with open(localfn, 'r') as store:
        content = store.read().decode('utf-8')
    doc = fromstring(content)
    container = doc.cssselect('.category-binds')[0]

    # Map each heading's text to the element that follows the heading.
    sections = {}
    for heading in container.cssselect('article > h3'):
        sections[heading.text_content()] = heading.getnext()

    # The parent section (u'↑') is not needed by any caller, so it is not
    # looked up here.
    subcats = extract_categories(sections[u'↓'])
    relcats = extract_categories(sections[u'←'])
    return (subcats, relcats)
# Maps a category id to the ids of its related categories. process() adds an
# entry for every category it expands (so membership also marks a category as
# already visited), and parse() reads it to attach <refs> elements.
category_refs = {}
def process(elems, level, rootxml):
for el in elems:
print '-'*level, el[1].encode('utf-8'), '['+el[0]+']'
if el[0] not in category_refs:
subs, refs = parse_content(el[0])
category_refs[el[0]] = [r[0] for r in refs]
subs = list(subs)
isparent = len(subs) > 0
xmlel = append_category_xml(rootxml, isparent, *el)
process(subs, level+1, xmlel)
else:
append_category_xml_ref(rootxml, el[0])
def append_category_xml(root, isparent, cid, name, description):
    """Append a <cat> element for one category to `root`.

    The element carries the id as an attribute and the name in a <name>
    child. A <desc> child is added only when the description is non-empty
    and differs from the name.

    Args:
        root: Parent XML element.
        isparent: Whether the category has sub-categories.
        cid: Category id, stored in the ``id`` attribute.
        name: Category title.
        description: Category description; may be empty or None.

    Returns:
        The new <subs> container element when `isparent` is true,
        otherwise None.
    """
    cat = etree.SubElement(root, 'cat', {'id': cid})
    etree.SubElement(cat, 'name').text = name
    if description and description != name:
        etree.SubElement(cat, 'desc').text = description
    if isparent:
        return etree.SubElement(cat, 'subs')
    return None
def append_category_xml_ref(root, ref_id):
    """Append ``<ref id="ref_id"/>`` to `root` and return the new element."""
    attributes = {'id': ref_id}
    ref_element = etree.SubElement(root, 'ref', attributes)
    return ref_element
def append_category_refs(parent, refs):
    """Append a <refs> element to `parent` with one <ref> child per id.

    Args:
        parent: XML element that receives the <refs> container.
        refs: Iterable of category ids.
    """
    container = etree.SubElement(parent, 'refs')
    for ref_id in refs:
        append_category_xml_ref(container, ref_id)
def parse():
root = parse_main()
now = datetime.isoformat(datetime.utcnow())+'Z'
xmlroot = etree.Element('cats', { 'created': now })
for l0, l1s in root.iteritems():
print '-'*1, l0[1].encode('utf-8'), '['+l0[0]+']'
l0xml = append_category_xml(xmlroot, True, *l0)
for l1, l2s in l1s.iteritems():
print '-'*2, l1[1].encode('utf-8'), '['+l1[0]+']'
l1xml = append_category_xml(l0xml, True, *l1)
process(l2s, 3, l1xml)
for k, v in category_refs.iteritems():
if (len(v) > 0):
e = xmlroot.xpath('//cat[@id='+k+']')[0]
append_category_refs(e, v)
with open('lomonosov.xml', 'w') as f:
f.write(etree.tostring(xmlroot, encoding='utf-8', pretty_print=True))
if __name__ == "__main__":
    # Run the full scrape only when executed as a script, not on import.
    parse()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment