Last active
August 29, 2015 14:23
-
-
Save beratdogan/980e1bb0fe49c5f3dc59 to your computer and use it in GitHub Desktop.
Parsing sabah.com.tr's RSS feed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from urlparse import urlparse, urljoin | |
from lxml import html, etree | |
from lxml.cssselect import CSSSelector | |
def get_content(url, timeout=10):
    """Download *url* and return the ``requests`` response object.

    :param url: address of the resource to fetch.
    :param timeout: seconds to wait for the server before giving up;
        without it ``requests`` may block indefinitely.
    :returns: the response; callers read ``.content`` or ``.text`` from it.
    :raises requests.exceptions.RequestException: on connection problems,
        timeouts, or a 4xx/5xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on an HTTP error instead of handing an error page to the
    # XML parser further down the pipeline.
    response.raise_for_status()
    return response
def get_selector(css_selector):
    """Build a ``CSSSelector`` from the CSS expression *css_selector*.

    :param css_selector: CSS selector string, e.g. ``'channel > item'``.
    :returns: a callable selector object that can be applied to a parsed
        tree (see ``filter_nodes``).
    """
    return CSSSelector(css_selector)
def get_dom_tree(html):
    """Parse a serialized XML document and return its root element.

    :param html: the document source. Prefer raw bytes: lxml does not accept
        already-decoded text that still carries an XML encoding declaration.
    :returns: the root element produced by ``etree.fromstring``.
    """
    # NOTE: inside this function the parameter name ``html`` shadows the
    # ``html`` module imported from lxml at the top of the file.
    return etree.fromstring(html)
def filter_nodes(dom, selector):
    """Apply *selector* to *dom* and return whatever the selector returns.

    :param dom: the parsed tree (or any object the selector accepts).
    :param selector: a callable taking the tree as its only argument.
    :returns: the result of ``selector(dom)``.
    """
    return selector(dom)
def get_rss_dom_and_item_selector(
        url='http://www.milliyet.com.tr/D/rss/rss/Rss_3.xml'):
    """Fetch an RSS feed and prepare the selector for its items.

    :param url: address of the RSS document; defaults to the feed this
        script was written for.
    :returns: a ``(rss_dom, item_selector)`` tuple, ready to be unpacked
        into ``get_news``.
    """
    rss_contents = get_content(url)
    # Hand the parser the undecoded bytes so it can honour the encoding
    # declared inside the XML document itself.
    rss_dom = get_dom_tree(rss_contents.content)
    item_selector = get_selector('channel > item')
    return rss_dom, item_selector
def get_news(*args):
    """Yield one dict per feed item with its title, link and pubDate.

    :param args: forwarded unchanged to ``filter_nodes`` (the parsed tree
        followed by the item selector).
    :returns: a generator of dicts keyed by ``'title'``, ``'link'`` and
        ``'pubDate'``; a value is ``None`` when the item has no such child
        element or the element has no text.
    """
    fields = ('title', 'link', 'pubDate')
    for item in filter_nodes(*args):
        entry = {}
        for field in fields:
            node = item.find(field)
            # ``find`` returns None for a missing child; compare with
            # ``is not None`` rather than relying on element truthiness.
            entry[field] = node.text if node is not None else None
        yield entry
def main():
    """Build the news generator and drop into an interactive debugger.

    The generator is not consumed here; it is left in scope so it can be
    inspected from the ipdb prompt, e.g. with ``next(news_generator)``.
    """
    rss_dom, item_selector = get_rss_dom_and_item_selector()
    news_generator = get_news(rss_dom, item_selector)
    # Deliberate breakpoint: this script is meant for interactive exploration.
    import ipdb
    ipdb.set_trace()


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
cssselect==0.9.1
gnureadline==6.3.3
ipdb==0.8.1
ipython==3.2.0
lxml==3.4.4
requests==2.7.0
wsgiref==0.1.2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from urlparse import urlparse, urljoin | |
from lxml import html, etree | |
from lxml.cssselect import CSSSelector | |
def get_content(url, timeout=10):
    """Perform an HTTP GET on *url* and hand back the response.

    :param url: address of the resource to fetch.
    :param timeout: how many seconds to wait on the server; ``requests``
        applies no limit unless one is given.
    :returns: the ``requests`` response object.
    :raises requests.exceptions.RequestException: if the request fails,
        times out, or the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors immediately rather than letting an error page
    # reach the XML parser.
    response.raise_for_status()
    return response
def get_selector(css_selector):
    """Turn the CSS expression *css_selector* into a ``CSSSelector``.

    :param css_selector: CSS selector string, e.g. ``'channel > item'``.
    :returns: the selector object; call it with a parsed tree to obtain the
        matching nodes.
    """
    return CSSSelector(css_selector)
def get_dom_tree(html):
    """Parse XML source and return the root element of the resulting tree.

    :param html: the document source. Raw bytes are the safe choice: lxml
        rejects decoded text that still includes an XML encoding declaration.
    :returns: the root element returned by ``etree.fromstring``.
    """
    # NOTE: the parameter name hides the lxml ``html`` module imported at
    # file level for the duration of this call.
    return etree.fromstring(html)
def filter_nodes(dom, selector):
    """Run *selector* against *dom* and return its result unchanged.

    :param dom: the parsed tree (or any object the selector accepts).
    :param selector: a callable that takes the tree as its single argument.
    :returns: whatever ``selector(dom)`` produces.
    """
    return selector(dom)
def get_rss_dom_and_item_selector(
        url='http://www.sabah.com.tr/rss/ekonomi.xml'):
    """Download an RSS feed, parse it, and build the item selector.

    :param url: address of the RSS document; defaults to the feed this
        script was written for.
    :returns: a ``(rss_dom, item_selector)`` tuple suitable for unpacking
        into ``get_news``.
    """
    rss_contents = get_content(url)
    # Parse the raw bytes, not ``.text``: lxml refuses decoded text that
    # still carries an XML encoding declaration, and the parser should be
    # the one to apply the encoding the document declares.
    rss_dom = get_dom_tree(rss_contents.content)
    item_selector = get_selector('channel > item')
    return rss_dom, item_selector
def get_news(*args):
    """Yield one dict per feed item with title, link, category and pubDate.

    :param args: forwarded unchanged to ``filter_nodes`` (the parsed tree
        followed by the item selector).
    :returns: a generator of dicts keyed by ``'title'``, ``'link'``,
        ``'category'`` and ``'pubDate'``; a value is ``None`` when the item
        has no such child element or the element has no text.
    """
    fields = ('title', 'link', 'category', 'pubDate')
    for item in filter_nodes(*args):
        entry = {}
        for field in fields:
            node = item.find(field)
            # ``find`` returns None for a missing child; an item without,
            # say, a category must not abort the whole iteration.
            entry[field] = node.text if node is not None else None
        yield entry
def main():
    """Create the news generator and open an interactive debugger prompt.

    Nothing is consumed automatically; the generator stays in scope for
    manual inspection from the ipdb prompt.
    """
    rss_dom, item_selector = get_rss_dom_and_item_selector()
    news_generator = get_news(rss_dom, item_selector)
    # Deliberate breakpoint: at the prompt, call next(news_generator) to
    # pull the next parsed item.
    import ipdb
    ipdb.set_trace()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment