Created
August 14, 2015 09:17
-
-
Save minhoryang/aa6d0071d22c27917ba4 to your computer and use it in GitHub Desktop.
Korean News Crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!py3env/bin/python | |
import requests | |
import sys | |
import pprint | |
import bs4 | |
def get(url):
    """Return a sorted list of unique href values of all <a> tags at *url*.

    Anchors without an href attribute are skipped; duplicates are removed
    and the result is sorted for deterministic output.
    """
    # Set comprehension replaces the original append-loop + set() + sort():
    # one pass, dedup and deterministic ordering in a single expression.
    return sorted({a['href'] for a in get_html(url).find_all('a') if a.get('href')})
def get_html(url):
    """Fetch *url* over HTTP and return it parsed as a BeautifulSoup tree."""
    # Name the parser explicitly: when omitted, bs4 picks whichever parser
    # happens to be installed (and warns since bs4 4.4), so the parse tree
    # could differ between machines. "html.parser" is always available.
    return bs4.BeautifulSoup(requests.get(url).content, "html.parser")
if __name__ == "__main__":
    # Usage: a_href.py URL — dump every link found on the page.
    target = sys.argv[1]
    links = get(target)
    pprint.pprint(links)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!py3env/bin/python | |
import os | |
import re | |
import sys | |
import a_href | |
##############################################
from html.parser import HTMLParser | |
class MLStripper(HTMLParser):
    """HTML parser that discards all markup and keeps only the text fed to it."""

    def __init__(self):
        super().__init__()
        self.reset()
        self.fed = []  # text runs seen so far, in document order

    def handle_data(self, chunk):
        # HTMLParser calls this for every run of text outside a tag.
        self.fed.append(chunk)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return *html* with every tag removed, leaving only its text content."""
    stripper = MLStripper()
    stripper.feed(html)
    return stripper.get_data()
##############################################
class Newssite:
    """Base class for one news site's crawler.

    Subclasses set the class attributes below; crawl() then walks the site's
    front page, recognizes article links, and saves each article's stripped
    text to disk exactly once.
    """

    # Regex whose first capture group extracts the article id from a URL.
    ARTICLE_URL_ID_REGEX = ""
    # Class name or element id identifying the article body tag.
    ARTICLE_HTML_RULE = ""
    # rule_for_class or rule_for_id; stored as a plain function on the class,
    # so instance attribute access binds it like a normal method.
    ARTICLE_HTML_SEARCH_METHOD = None
    # Front-page URL; also used to complete relative links.
    DOMAIN = ""
    # Subdirectory under ~/RUNNING/TXT/ where article texts are stored.
    FOLDERNAME = ""

    def rule_for_class(self, t):
        """Locate the article body tag by CSS class name."""
        return t.find(attrs={'class': self.ARTICLE_HTML_RULE})

    def rule_for_id(self, t):
        """Locate the article body tag by element id."""
        return t.find(id=self.ARTICLE_HTML_RULE)

    def _folder_location(self, article_id, name):
        """Path of the text file an article with *article_id* is saved to."""
        return os.path.expanduser("~/RUNNING/TXT/" + name + "/" + article_id + ".txt")

    def _is_article(self, article_url):
        """Return the captured article id(s), or [] if the URL is no article."""
        return re.findall(self.ARTICLE_URL_ID_REGEX, article_url)

    def _is_already_got(self, article_id):
        """True if this article was saved by a previous run."""
        return os.path.isfile(self._folder_location(article_id, self.FOLDERNAME))

    def _guess_full_url(self, article_url):
        """Best-effort absolute URL for *article_url*; None if none can be made."""
        if 'javascript:' in article_url:
            # Links like javascript:open('http://...'): pull the first quoted
            # http URL out of the call, trying both quote styles.
            if 'http' in article_url:
                for part in article_url.split("'"):
                    if 'http' in part:
                        return part
                for part in article_url.split('"'):
                    if 'http' in part:
                        return part
            return None
        elif 'http' not in article_url:
            # Relative link: anchor it at the site's domain.
            return self.DOMAIN + article_url
        return article_url

    def _extract_and_save_text(self, article_id, article_url):
        """Download one article, strip its markup, and save it (best-effort)."""
        try:
            t = self.ARTICLE_HTML_SEARCH_METHOD(a_href.get_html(article_url))
            if t:
                s = strip_tags(t.text)
                # utf-8 explicitly: Korean text would fail to encode under a
                # non-UTF-8 locale's default encoding.
                with open(self._folder_location(article_id, self.FOLDERNAME), "w", encoding="utf-8") as out:
                    out.write(s)
            else:
                print(article_id + " is not fit for _extract")
        except Exception as e:
            # Deliberate best-effort: one broken article must not stop the crawl.
            print(e)

    def _what_woman_wanted(self, tags):
        """Return the direct text children of *tags*, skipping nested tags.

        The original referenced bs4.element.Tag although this module never
        imports bs4 (a NameError), and built the list without returning it.
        NavigableString subclasses str while Tag does not, so an isinstance
        check against str selects exactly the text children.
        """
        s = []
        for child in tags.children:
            if isinstance(child, str):
                s.append(str(child))
        return s

    def crawl(self):
        """Crawl DOMAIN's front page and save every new article found there."""
        for article_url in a_href.get(self.DOMAIN):
            found = self._is_article(article_url)
            if found:  # truthiness instead of len() per PEP 8
                article_id = found[0]
                if not self._is_already_got(article_id):
                    url = self._guess_full_url(article_url)
                    print(url)
                    if url:
                        self._extract_and_save_text(article_id, url)
class Haninews(Newssite):
    """Hankyoreh (hani.co.kr): article body is the article-contents class."""
    DOMAIN = "http://www.hani.co.kr/"
    FOLDERNAME = "hani"
    ARTICLE_URL_ID_REGEX = ".*/arti/.*/([0-9]*).*?_fr=.*"
    ARTICLE_HTML_RULE = "article-contents"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
class Khannews(Newssite):
    """Kyunghyang (khan.co.kr): article body carries the _article id."""
    DOMAIN = "http://www.khan.co.kr/"
    FOLDERNAME = "khan"
    ARTICLE_URL_ID_REGEX = ".*news.khan.co.kr.*artid=([0-9]*).*"
    ARTICLE_HTML_RULE = "_article"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_id
class Kukinews(Newssite):
    """Kukinews (kukinews.com): article body carries the _article id."""
    # Fixed attribute name: the original misspelled this ARTICLE_URL_ID_RULE,
    # so the base class's empty ARTICLE_URL_ID_REGEX was used instead and
    # re.findall("") "matched" every URL with an empty article id.
    ARTICLE_URL_ID_REGEX = ".*news.kukinews.com.*arcid=([0-9]*).*"
    ARTICLE_HTML_RULE = "_article"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_id
    DOMAIN = "http://www.kukinews.com/"
    FOLDERNAME = "kukinews"
class Ohmynews(Newssite):
    """OhmyNews (ohmynews.com): article body is the at_contents class."""
    DOMAIN = "http://www.ohmynews.com/"
    FOLDERNAME = "ohmynews"
    ARTICLE_URL_ID_REGEX = ".*at_pg.*CNTN_CD=(A[0-9]*).*"
    ARTICLE_HTML_RULE = "at_contents"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
class Joongang(Newssite):
    """JoongAng Ilbo (joongang.joins.com): body is the article_content class."""
    # Raw string: "\." in a plain literal is an invalid escape sequence
    # (SyntaxWarning in modern Python); the runtime value is unchanged.
    ARTICLE_URL_ID_REGEX = r".*article.*/([0-9]*)\.html.*"
    ARTICLE_HTML_RULE = "article_content"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    DOMAIN = "http://joongang.joins.com/"
    FOLDERNAME = "joongang"
class Isplus(Joongang):
    """Ilgan Sports, a Joongang property — same markup, different domain."""
    FOLDERNAME = "isplus"
    DOMAIN = "http://isplus.joins.com"
class Cosmopolitan(Newssite):
    """Cosmopolitan Korea: article body is the clip_wrap class."""
    DOMAIN = "http://cosmopolitan.joins.com/"
    FOLDERNAME = "cosmopolitan"
    ARTICLE_URL_ID_REGEX = ".*strArtclCd=(A[0-9]*).*"
    ARTICLE_HTML_RULE = "clip_wrap"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
class Cosmopolitan2(Cosmopolitan):
    """Cosmopolitan variant whose articles use the clip_wrap_none class."""
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    ARTICLE_HTML_RULE = "clip_wrap_none"
class Seoul(Newssite):
    """Seoul Shinmun (seoul.co.kr): article body is the atic_txt1 class."""
    DOMAIN = "http://www.seoul.co.kr/"
    FOLDERNAME = "seoul"
    ARTICLE_URL_ID_REGEX = ".*newsView.*id=([0-9]*).*"
    ARTICLE_HTML_RULE = "atic_txt1"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
class Asiae(Newssite):
    """Asia Economy (asiae.co.kr): article body is the article class."""
    DOMAIN = "http://www.asiae.co.kr/news/"
    FOLDERNAME = "asiae"
    ARTICLE_URL_ID_REGEX = ".*idxno=([0-9]*).*"
    ARTICLE_HTML_RULE = "article"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
class Hankyung(Newssite):
    """Korea Economic Daily (hankyung.com): body is the articleTxt class."""
    # Raw string: "\." and "\?" in a plain literal are invalid escape
    # sequences (SyntaxWarning in modern Python); runtime value unchanged.
    ARTICLE_URL_ID_REGEX = r".*newsview\.php\?aid=([0-9a-z]*).*"
    ARTICLE_HTML_RULE = "articleTxt"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    DOMAIN = "http://www.hankyung.com/"
    FOLDERNAME = "hankyung"
class Unknown():
    """Fallback used when argv names no known site.

    Mirrors the Newssite crawl() interface so main can call it blindly.
    """

    def crawl(self):
        print("Unknown Source")
# One ready-made crawler instance per supported site, keyed by class name so
# the command-line argument can select it directly.
classmap = {
    site.__name__: site()
    for site in (
        Haninews, Khannews, Kukinews, Ohmynews, Joongang, Isplus,
        Cosmopolitan, Cosmopolitan2, Seoul, Asiae, Hankyung,
    )
}
if __name__ == "__main__":
    # argv[1] picks the site; anything unrecognized gets the Unknown stub.
    crawler = classmap.get(sys.argv[1], Unknown())
    crawler.crawl()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment