Skip to content

Instantly share code, notes, and snippets.

@minhoryang
Created August 14, 2015 09:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save minhoryang/aa6d0071d22c27917ba4 to your computer and use it in GitHub Desktop.
Save minhoryang/aa6d0071d22c27917ba4 to your computer and use it in GitHub Desktop.
Korean News Crawler
#!py3env/bin/python
import requests
import sys
import pprint
import bs4
def get(url):
    """Return the distinct ``href`` values of all ``<a>`` tags on *url*, sorted.

    Anchors without an ``href`` (or with an empty one) are ignored.
    """
    unique_links = {
        anchor['href']
        for anchor in get_html(url).find_all('a')
        if anchor.get('href')
    }
    return sorted(unique_links)
def get_html(url, timeout=10):
    """Download *url* and return the response body parsed by BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled server would block the caller forever.

    Raises:
        requests.exceptions.RequestException: propagated unchanged from
            ``requests.get`` (connection errors, timeouts, ...).
    """
    response = requests.get(url, timeout=timeout)
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed, so the same page can parse differently
    # from one machine to the next (and a warning is emitted).
    return bs4.BeautifulSoup(response.content, "html.parser")
if __name__ == "__main__":
    # Command-line use: print every link found on the page given as argv[1].
    if len(sys.argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit("usage: {} URL".format(sys.argv[0]))
    pprint.pprint(get(sys.argv[1]))
#!py3env/bin/python
import os
import re
import sys
import a_href
###############################################
from html.parser import HTMLParser
class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text data."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no extra reset() here.
        # convert_charrefs=True makes entities such as "&amp;" reach
        # handle_data already decoded.
        super().__init__(convert_charrefs=True)
        self.fed = []  # text fragments in the order the parser reported them

    def handle_data(self, d):
        """Collect one fragment of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return *html* with every tag removed and character references decoded.

    Args:
        html: Markup (or plain text) as a ``str``.

    Returns:
        The concatenated text content; ``""`` for empty input.
    """
    s = MLStripper()
    s.feed(html)
    # feed() may hold back trailing text that contains an unterminated "&"
    # (for example "Q&A") because more input could still follow; close()
    # tells the parser the input is complete so that text is not lost.
    s.close()
    return s.get_data()
###############################################
class Newssite:
    """Base crawler: scans a site's front page for article links and saves
    the text of every article that has not been saved yet.

    Subclasses configure the crawl through the class attributes below.
    """

    # Regex applied to every link with re.findall; the first match is used
    # as the article id (and as the output file name).
    ARTICLE_URL_ID_REGEX = ""
    # Value of the ``class`` or ``id`` attribute that marks the article body.
    ARTICLE_HTML_RULE = ""
    # rule_for_class or rule_for_id; looked up on the instance, so it is
    # called as a method with the parsed page as its argument.
    ARTICLE_HTML_SEARCH_METHOD = None
    # Page whose links are crawled; also the base for resolving relative links.
    DOMAIN = ""
    # Sub-folder of ~/RUNNING/TXT that receives the saved articles.
    FOLDERNAME = ""

    def rule_for_class(self, t):
        """Return the first element of *t* whose class is ARTICLE_HTML_RULE."""
        return t.find(attrs={'class': self.ARTICLE_HTML_RULE})

    def rule_for_id(self, t):
        """Return the element of *t* whose id is ARTICLE_HTML_RULE."""
        return t.find(id=self.ARTICLE_HTML_RULE)

    def _folder_location(self, article_id, name):
        """Return the path of the text file for *article_id* in folder *name*."""
        return os.path.expanduser(
            "~/RUNNING/TXT/{}/{}.txt".format(name, article_id))

    def _is_article(self, article_url):
        """Return the ids ARTICLE_URL_ID_REGEX finds in *article_url* (may be empty)."""
        return re.findall(self.ARTICLE_URL_ID_REGEX, article_url)

    def _is_already_got(self, article_id):
        """Return True when the article's text file already exists."""
        return os.path.isfile(
            self._folder_location(article_id, self.FOLDERNAME))

    def _guess_full_url(self, article_url):
        """Turn a link taken from the front page into a fetchable URL.

        Returns:
            The absolute URL, or None for a ``javascript:`` link that carries
            no quoted http(s) address.
        """
        # Local import so this block does not depend on the file's import list.
        from urllib.parse import urljoin

        if 'javascript:' in article_url:
            # The address is a quoted argument, e.g. javascript:open('http://...').
            # Split on both quote characters at once; splitting on one kind
            # only would hand back the whole javascript: string whenever the
            # other kind of quote was used.
            for segment in re.split("['\"]", article_url):
                if 'http' in segment and 'javascript:' not in segment:
                    return segment
            return None
        if article_url.startswith(('http://', 'https://')):
            return article_url
        # Resolve relative links against the page they were found on; plain
        # concatenation produced "//" or dropped the "/" depending on how
        # DOMAIN and the link happened to be written.
        return urljoin(self.DOMAIN, article_url)

    def _extract_and_save_text(self, article_id, article_url):
        """Download one article and write its text to the article's file.

        Best effort: any failure is printed and swallowed so that a single
        bad article does not abort the crawl.
        """
        try:
            body = self.ARTICLE_HTML_SEARCH_METHOD(
                a_href.get_html(article_url))
            if not body:
                print(article_id + " is not fit for _extract")
                return
            text = strip_tags(body.text)
            target = self._folder_location(article_id, self.FOLDERNAME)
            # Create the output folder on first use instead of failing.
            os.makedirs(os.path.dirname(target), exist_ok=True)
            # Explicit UTF-8: the locale's default encoding may be unable to
            # represent the article text.
            with open(target, "w", encoding="utf-8") as out:
                out.write(text)
        except Exception as e:
            print(e)

    def _what_woman_wanted(self, tags):
        """Return the text nodes that are direct children of *tags*.

        Nested elements are skipped. Not called anywhere in this file.
        """
        # Text nodes are str subclasses while elements are not, so checking
        # for str avoids needing the bs4 module, which this script never
        # imports.
        return [str(child) for child in tags.children
                if isinstance(child, str)]

    def crawl(self):
        """Save every not-yet-saved article linked from DOMAIN."""
        for article_url in a_href.get(self.DOMAIN):
            found_ids = self._is_article(article_url)
            if not found_ids:
                continue
            article_id = found_ids[0]
            if not article_id:
                # The capture group may match an empty string; an empty id
                # would map every such link onto the same ".txt" file.
                continue
            if self._is_already_got(article_id):
                continue
            url = self._guess_full_url(article_url)
            print(url)
            if url:
                self._extract_and_save_text(article_id, url)
class Haninews(Newssite):
    """Crawler settings for http://www.hani.co.kr/.

    The article body is looked up by CSS class.
    """

    DOMAIN = "http://www.hani.co.kr/"
    FOLDERNAME = "hani"
    ARTICLE_URL_ID_REGEX = ".*/arti/.*/([0-9]*).*?_fr=.*"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    ARTICLE_HTML_RULE = "article-contents"
class Khannews(Newssite):
    """Crawler settings for http://www.khan.co.kr/.

    The article body is looked up by element id.
    """

    DOMAIN = "http://www.khan.co.kr/"
    FOLDERNAME = "khan"
    ARTICLE_URL_ID_REGEX = ".*news.khan.co.kr.*artid=([0-9]*).*"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_id
    ARTICLE_HTML_RULE = "_article"
class Kukinews(Newssite):
    """Crawler settings for http://www.kukinews.com/.

    The article body is looked up by element id.
    """

    # Must be called ARTICLE_URL_ID_REGEX: that is the attribute
    # Newssite._is_article reads. Under the old name the base class's empty
    # pattern was used, so every link "matched" with an empty article id.
    ARTICLE_URL_ID_REGEX = ".*news.kukinews.com.*arcid=([0-9]*).*"
    # Former attribute name, kept as an alias for anything that still reads it.
    ARTICLE_URL_ID_RULE = ARTICLE_URL_ID_REGEX
    ARTICLE_HTML_RULE = "_article"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_id
    DOMAIN = "http://www.kukinews.com/"
    FOLDERNAME = "kukinews"
class Ohmynews(Newssite):
    """Crawler settings for http://www.ohmynews.com/.

    The article body is looked up by CSS class.
    """

    DOMAIN = "http://www.ohmynews.com/"
    FOLDERNAME = "ohmynews"
    ARTICLE_URL_ID_REGEX = ".*at_pg.*CNTN_CD=(A[0-9]*).*"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    ARTICLE_HTML_RULE = "at_contents"
class Joongang(Newssite):
    """Crawler settings for http://joongang.joins.com/.

    The article body is looked up by CSS class.
    """

    # Raw string: "\." in a normal string literal is an invalid escape
    # sequence, which newer Python versions warn about. The pattern itself
    # is unchanged.
    ARTICLE_URL_ID_REGEX = r".*article.*/([0-9]*)\.html.*"
    ARTICLE_HTML_RULE = "article_content"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    DOMAIN = "http://joongang.joins.com/"
    FOLDERNAME = "joongang"
class Isplus(Joongang):
    """Joongang's URL and body rules applied to http://isplus.joins.com."""

    FOLDERNAME = "isplus"
    DOMAIN = "http://isplus.joins.com"
class Cosmopolitan(Newssite):
    """Crawler settings for http://cosmopolitan.joins.com/.

    The article body is looked up by CSS class.
    """

    DOMAIN = "http://cosmopolitan.joins.com/"
    FOLDERNAME = "cosmopolitan"
    ARTICLE_URL_ID_REGEX = ".*strArtclCd=(A[0-9]*).*"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    ARTICLE_HTML_RULE = "clip_wrap"
class Cosmopolitan2(Cosmopolitan):
    """Cosmopolitan variant that looks for the ``clip_wrap_none`` class instead."""

    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    ARTICLE_HTML_RULE = "clip_wrap_none"
class Seoul(Newssite):
    """Crawler settings for http://www.seoul.co.kr/.

    The article body is looked up by CSS class.
    """

    DOMAIN = "http://www.seoul.co.kr/"
    FOLDERNAME = "seoul"
    ARTICLE_URL_ID_REGEX = ".*newsView.*id=([0-9]*).*"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    ARTICLE_HTML_RULE = "atic_txt1"
class Asiae(Newssite):
    """Crawler settings for http://www.asiae.co.kr/news/.

    The article body is looked up by CSS class.
    """

    DOMAIN = "http://www.asiae.co.kr/news/"
    FOLDERNAME = "asiae"
    ARTICLE_URL_ID_REGEX = ".*idxno=([0-9]*).*"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    ARTICLE_HTML_RULE = "article"
class Hankyung(Newssite):
    """Crawler settings for http://www.hankyung.com/.

    The article body is looked up by CSS class.
    """

    # Raw string: "\." and "\?" in a normal string literal are invalid escape
    # sequences, which newer Python versions warn about. The pattern itself
    # is unchanged.
    ARTICLE_URL_ID_REGEX = r".*newsview\.php\?aid=([0-9a-z]*).*"
    ARTICLE_HTML_RULE = "articleTxt"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    DOMAIN = "http://www.hankyung.com/"
    FOLDERNAME = "hankyung"
class Unknown:
    """Stand-in crawler for a site name that has no entry in ``classmap``."""

    def crawl(self):
        """Print a notice instead of crawling; nothing is downloaded."""
        message = "Unknown Source"
        print(message)
# Ready-to-use crawler instances keyed by class name; the key is the site
# name expected on the command line.
classmap = {
    site_class.__name__: site_class()
    for site_class in (
        Haninews,
        Khannews,
        Kukinews,
        Ohmynews,
        Joongang,
        Isplus,
        Cosmopolitan,
        Cosmopolitan2,
        Seoul,
        Asiae,
        Hankyung,
    )
}
if __name__ == "__main__":
    # Command-line use: crawl the site whose name is given as argv[1].
    if len(sys.argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit("usage: {} SITE (one of: {})".format(
            sys.argv[0], ", ".join(sorted(classmap))))
    # Unrecognised names fall back to Unknown, which only prints a notice.
    classmap.get(sys.argv[1], Unknown()).crawl()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment