Skip to content

Instantly share code, notes, and snippets.

@beratdogan
Last active August 29, 2015 14:23
Show Gist options
  • Save beratdogan/980e1bb0fe49c5f3dc59 to your computer and use it in GitHub Desktop.
Save beratdogan/980e1bb0fe49c5f3dc59 to your computer and use it in GitHub Desktop.
Parsing RSS feeds from milliyet.com.tr and sabah.com.tr
import requests
from urlparse import urlparse, urljoin
from lxml import html, etree
from lxml.cssselect import CSSSelector
def get_content(url, timeout=30):
    """Fetch *url* over HTTP and return the ``requests`` response object.

    :param url: address to download.
    :param timeout: seconds to wait for the server before giving up.
        Without a timeout ``requests`` may block indefinitely on a
        stalled connection, so a finite default is used.
    :returns: the response returned by ``requests.get``; the HTTP status
        is not checked here.
    """
    return requests.get(url, timeout=timeout)
def get_selector(css_selector):
    """Build a ``CSSSelector`` object from the *css_selector* expression."""
    compiled = CSSSelector(css_selector)
    return compiled
def get_dom_tree(html):
    """Parse the given markup with ``etree.fromstring`` and return the result.

    Note: the parameter name shadows the ``html`` module imported from
    lxml at the top of the file; it is kept so keyword callers keep working.
    """
    root = etree.fromstring(html)
    return root
def filter_nodes(dom, selector):
    """Call *selector* with *dom* and return whatever it produces."""
    matches = selector(dom)
    return matches
def get_rss_dom_and_item_selector():
    """Download the Milliyet RSS feed and prepare it for item extraction.

    :returns: a ``(dom, selector)`` pair -- the parsed feed and a selector
        built from the ``channel > item`` expression.
    """
    feed_url = 'http://www.milliyet.com.tr/D/rss/rss/Rss_3.xml'
    response = get_content(feed_url)
    # The raw response bytes are handed to the parser.
    return get_dom_tree(response.content), get_selector('channel > item')
def get_news(*args):
    """Lazily yield one dict per feed item.

    :param args: forwarded unchanged to ``filter_nodes`` (the parsed feed
        followed by the item selector).
    :returns: a generator of dicts keyed by ``title``, ``link`` and
        ``pubDate``; a value is ``None`` when the item has no such child
        element (or the element carries no text).
    """
    fields = ('title', 'link', 'pubDate')
    for item in filter_nodes(*args):
        entry = {}
        for field in fields:
            node = item.find(field)
            # Feed items may omit a field; compare against None explicitly
            # because a childless lxml element evaluates as false.
            entry[field] = node.text if node is not None else None
        yield entry
def main():
    """Build the lazy news generator and drop into an ipdb prompt."""
    rss_dom, item_selector = get_rss_dom_and_item_selector()
    news_generator = get_news(rss_dom, item_selector)
    # The local names above are what gets inspected from the debugger
    # prompt, so they are intentionally left unchanged.
    import ipdb
    ipdb.set_trace()


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
cssselect==0.9.1
gnureadline==6.3.3
ipdb==0.8.1
ipython==3.2.0
lxml==3.4.4
requests==2.7.0
wsgiref==0.1.2
import requests
from urlparse import urlparse, urljoin
from lxml import html, etree
from lxml.cssselect import CSSSelector
def get_content(url, timeout=30):
    """Fetch *url* over HTTP and return the ``requests`` response object.

    :param url: address to download.
    :param timeout: seconds to wait for the server before giving up.
        Without a timeout ``requests`` may block indefinitely on a
        stalled connection, so a finite default is used.
    :returns: the response returned by ``requests.get``; the HTTP status
        is not checked here.
    """
    return requests.get(url, timeout=timeout)
def get_selector(css_selector):
    """Build a ``CSSSelector`` object from the *css_selector* expression."""
    compiled = CSSSelector(css_selector)
    return compiled
def get_dom_tree(html):
    """Parse the given markup with ``etree.fromstring`` and return the result.

    Note: the parameter name shadows the ``html`` module imported from
    lxml at the top of the file; it is kept so keyword callers keep working.
    """
    root = etree.fromstring(html)
    return root
def filter_nodes(dom, selector):
    """Call *selector* with *dom* and return whatever it produces."""
    matches = selector(dom)
    return matches
def get_rss_dom_and_item_selector():
    """Download the Sabah economy RSS feed and prepare it for item extraction.

    :returns: a ``(dom, selector)`` pair -- the parsed feed and a selector
        built from the ``channel > item`` expression.
    """
    rss_contents = get_content('http://www.sabah.com.tr/rss/ekonomi.xml')
    # Parse the raw bytes, not the decoded ``.text``: lxml refuses unicode
    # input that carries an XML encoding declaration, and the parser should
    # honour the encoding the document itself declares.
    rss_dom = get_dom_tree(rss_contents.content)
    item_selector = get_selector('channel > item')
    return rss_dom, item_selector
def get_news(*args):
    """Lazily yield one dict per feed item.

    :param args: forwarded unchanged to ``filter_nodes`` (the parsed feed
        followed by the item selector).
    :returns: a generator of dicts keyed by ``title``, ``link``,
        ``category`` and ``pubDate``; a value is ``None`` when the item has
        no such child element (or the element carries no text).
    """
    fields = ('title', 'link', 'category', 'pubDate')
    for item in filter_nodes(*args):
        entry = {}
        for field in fields:
            node = item.find(field)
            # Feed items may omit a field (``category`` in particular);
            # compare against None explicitly because a childless lxml
            # element evaluates as false.
            entry[field] = node.text if node is not None else None
        yield entry
def main():
    """Build the lazy news generator and drop into an ipdb prompt."""
    rss_dom, item_selector = get_rss_dom_and_item_selector()
    news_generator = get_news(rss_dom, item_selector)
    # At the prompt, pull entries one at a time with news_generator.next()
    import ipdb
    ipdb.set_trace()


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment