minhoryang/Newsroom.py

## a_href.py
#!py3env/bin/python
import requests
import sys
import pprint
import bs4


def get(url):
    """Return the sorted, de-duplicated ``href`` values of a page's anchors.

    The page at ``url`` is fetched and parsed by ``get_html``. ``<a>`` tags
    whose ``href`` is missing or empty are ignored.
    """
    anchors = get_html(url).find_all('a')
    unique_hrefs = {anchor['href'] for anchor in anchors if anchor.get('href')}
    return sorted(unique_hrefs)

def get_html(url, parser="html.parser"):
    """Download ``url`` and return it parsed as a ``bs4.BeautifulSoup`` tree.

    Args:
        url: Address of the page, fetched with ``requests.get``.
        parser: Name of the BeautifulSoup tree builder. It is passed
            explicitly so the result does not depend on which optional
            parsers happen to be installed; the default is the one shipped
            with the standard library.

    Returns:
        The parsed document.
    """
    response = requests.get(url)
    return bs4.BeautifulSoup(response.content, parser)


if __name__ == "__main__":
    # Script mode: pretty-print the links of the page given as first argument.
    page_url = sys.argv[1]
    links = get(page_url)
    pprint.pprint(links)

## Newsroom.py
#!py3env/bin/python
import os
import re
import sys

import a_href


##############################################3
from html.parser import HTMLParser
class MLStripper(HTMLParser):
    """HTML parser that keeps only the text found between tags."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so it is not repeated.
        super().__init__()
        self.fed = []

    def handle_data(self, d):
        """Record one run of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return everything recorded so far as a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return the text content of ``html`` with all markup removed.

    Args:
        html: Markup (or plain text) to strip.

    Returns:
        The concatenated text nodes, with character references decoded.
    """
    s = MLStripper()
    s.feed(html)
    # The parser holds back trailing text that could be an unfinished
    # character reference (e.g. "AT&T"); close() flushes it so it is not lost.
    s.close()
    return s.get_data()
##############################################3


class Newssite:
    """Base crawler: scan a site's front page and save new articles as text.

    Subclasses describe a site through class attributes only:

    ARTICLE_URL_ID_REGEX
        Pattern applied to every link found on ``DOMAIN``; its capture group
        is the article id, which is also used as the saved file's name.
    ARTICLE_HTML_RULE
        Class name or element id of the node that holds the article body.
    ARTICLE_HTML_SEARCH_METHOD
        ``Newssite.rule_for_class`` or ``Newssite.rule_for_id``. It is read
        through ``self``, so the plain function becomes a bound method.
    DOMAIN
        Page whose links are scanned; also the base for relative links.
    FOLDERNAME
        Directory under ``~/RUNNING/TXT/`` that receives the text files.
    """
    ARTICLE_URL_ID_REGEX = ""
    ARTICLE_HTML_RULE = ""
    ARTICLE_HTML_SEARCH_METHOD = None
    DOMAIN = ""
    FOLDERNAME = ""

    def rule_for_class(self, t):
        """Return the first node in ``t`` whose class is ``ARTICLE_HTML_RULE``."""
        return t.find(attrs={'class': self.ARTICLE_HTML_RULE})

    def rule_for_id(self, t):
        """Return the node in ``t`` whose id is ``ARTICLE_HTML_RULE``."""
        return t.find(id=self.ARTICLE_HTML_RULE)

    def _folder_location(self, article_id, name):
        """Return the path of the text file for ``article_id`` in folder ``name``."""
        return os.path.expanduser("~/RUNNING/TXT/" + name + "/" + article_id + ".txt")

    def _is_article(self, article_url):
        """Return the article ids captured from ``article_url`` (may be empty)."""
        # An empty pattern matches at every position and would report every
        # link as an article with an empty id.
        if not self.ARTICLE_URL_ID_REGEX:
            return []
        return re.findall(self.ARTICLE_URL_ID_REGEX, article_url)

    def _is_already_got(self, article_id):
        """Return True when the article's text file already exists."""
        return os.path.isfile(self._folder_location(article_id, self.FOLDERNAME))

    def _guess_full_url(self, article_url):
        """Turn a raw ``href`` into a fetchable URL.

        Returns:
            * for ``javascript:`` links, the first quoted http(s) URL inside
              them, or ``None`` when there is none;
            * otherwise ``article_url`` resolved against ``DOMAIN`` (absolute
              URLs come back unchanged).
        """
        from urllib.parse import urljoin

        if 'javascript:' in article_url:
            # Accept either quote style around the embedded address.
            embedded = re.search(r"""["'](https?://[^"']+)""", article_url)
            return embedded.group(1) if embedded else None
        # urljoin copes with a DOMAIN lacking a trailing slash and with
        # root-relative links, which plain concatenation does not.
        return urljoin(self.DOMAIN, article_url)

    def _extract_and_save_text(self, article_id, article_url):
        """Fetch ``article_url`` and write the article body to its text file.

        Failures are printed instead of raised so that one bad article does
        not stop the crawl.
        """
        try:
            t = self.ARTICLE_HTML_SEARCH_METHOD(a_href.get_html(article_url))
            if not t:
                print(article_id + " is not fit for _extract")
                return
            # Build the text before touching the file system: a file left
            # behind by a failed attempt would mark the article as fetched.
            s = strip_tags(t.text)
            target = self._folder_location(article_id, self.FOLDERNAME)
            os.makedirs(os.path.dirname(target), exist_ok=True)
            # Explicit UTF-8 so writing does not depend on the locale.
            with open(target, "w", encoding="utf-8") as out:
                out.write(s)
        except Exception as e:
            print(e)

    def _what_woman_wanted(self, tags):
        """Return the ``.text`` of each direct non-Tag child of ``tags``.

        No caller is visible in this file; the list is returned so that the
        result is usable.
        """
        # Newsroom.py itself does not import bs4 (only a_href does).
        import bs4

        return [child.text for child in tags.children
                if not isinstance(child, bs4.element.Tag)]

    def crawl(self):
        """Save every not-yet-stored article linked from ``DOMAIN``."""
        for article_url in a_href.get(self.DOMAIN):
            found = self._is_article(article_url)
            if not found:
                continue
            article_id = found[0]
            # The id patterns use ``*`` and can capture an empty string,
            # which would map every such link to the same ".txt" file.
            if not article_id or self._is_already_got(article_id):
                continue
            url = self._guess_full_url(article_url)
            print(url)
            if url:
                self._extract_and_save_text(article_id, url)


class Haninews(Newssite):
    """Crawler settings for http://www.hani.co.kr/."""
    DOMAIN = "http://www.hani.co.kr/"
    FOLDERNAME = "hani"
    # The captured run of digits is the article id.
    # NOTE(review): ``.*?`` here is a lazy wildcard, not a literal "?".
    ARTICLE_URL_ID_REGEX = r".*/arti/.*/([0-9]*).*?_fr=.*"
    # The article body is located by class name.
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    ARTICLE_HTML_RULE = "article-contents"


class Khannews(Newssite):
    """Crawler settings for http://www.khan.co.kr/."""
    DOMAIN = "http://www.khan.co.kr/"
    FOLDERNAME = "khan"
    # The digits following ``artid=`` are the article id.
    ARTICLE_URL_ID_REGEX = r".*news.khan.co.kr.*artid=([0-9]*).*"
    # The article body is located by element id.
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_id
    ARTICLE_HTML_RULE = "_article"


class Kukinews(Newssite):
    """Crawler settings for http://www.kukinews.com/."""
    # The digits following ``arcid=`` are the article id. The attribute must
    # be named ARTICLE_URL_ID_REGEX: that is the name Newssite reads.
    ARTICLE_URL_ID_REGEX = r".*news.kukinews.com.*arcid=([0-9]*).*"
    # Former (misspelled) attribute name, kept as an alias.
    ARTICLE_URL_ID_RULE = ARTICLE_URL_ID_REGEX
    # The article body is located by element id.
    ARTICLE_HTML_RULE = "_article"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_id
    DOMAIN = "http://www.kukinews.com/"
    FOLDERNAME = "kukinews"


class Ohmynews(Newssite):
    """Crawler settings for http://www.ohmynews.com/."""
    DOMAIN = "http://www.ohmynews.com/"
    FOLDERNAME = "ohmynews"
    # The id is the letter "A" plus the digits following ``CNTN_CD=``.
    ARTICLE_URL_ID_REGEX = r".*at_pg.*CNTN_CD=(A[0-9]*).*"
    # The article body is located by class name.
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    ARTICLE_HTML_RULE = "at_contents"


class Joongang(Newssite):
    """Crawler settings for http://joongang.joins.com/."""
    # Raw string: ``\.`` is a regex escape, not a Python string escape.
    # The digits before ``.html`` are the article id.
    ARTICLE_URL_ID_REGEX = r".*article.*/([0-9]*)\.html.*"
    # The article body is located by class name.
    ARTICLE_HTML_RULE = "article_content"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    DOMAIN = "http://joongang.joins.com/"
    FOLDERNAME = "joongang"


class Isplus(Joongang):
    """Joongang settings applied to http://isplus.joins.com.

    Only the scanned page and the output folder differ from ``Joongang``.
    """
    FOLDERNAME = "isplus"
    DOMAIN = "http://isplus.joins.com"


class Cosmopolitan(Newssite):
    """Crawler settings for http://cosmopolitan.joins.com/."""
    DOMAIN = "http://cosmopolitan.joins.com/"
    FOLDERNAME = "cosmopolitan"
    # The id is the letter "A" plus the digits following ``strArtclCd=``.
    ARTICLE_URL_ID_REGEX = r".*strArtclCd=(A[0-9]*).*"
    # The article body is located by class name.
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    ARTICLE_HTML_RULE = "clip_wrap"


class Cosmopolitan2(Cosmopolitan):
    """Cosmopolitan settings with a different article-body class name."""
    # Same lookup method as the parent, restated explicitly.
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    ARTICLE_HTML_RULE = "clip_wrap_none"


class Seoul(Newssite):
    """Crawler settings for http://www.seoul.co.kr/."""
    DOMAIN = "http://www.seoul.co.kr/"
    FOLDERNAME = "seoul"
    # The digits following ``id=`` are the article id.
    ARTICLE_URL_ID_REGEX = r".*newsView.*id=([0-9]*).*"
    # The article body is located by class name.
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    ARTICLE_HTML_RULE = "atic_txt1"


class Asiae(Newssite):
    """Crawler settings for http://www.asiae.co.kr/news/."""
    DOMAIN = "http://www.asiae.co.kr/news/"
    FOLDERNAME = "asiae"
    # The digits following ``idxno=`` are the article id.
    ARTICLE_URL_ID_REGEX = r".*idxno=([0-9]*).*"
    # The article body is located by class name.
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    ARTICLE_HTML_RULE = "article"


class Hankyung(Newssite):
    """Crawler settings for http://www.hankyung.com/."""
    # Raw string: ``\.`` and ``\?`` are regex escapes, not Python string
    # escapes. The lowercase letters/digits after ``aid=`` are the article id.
    ARTICLE_URL_ID_REGEX = r".*newsview\.php\?aid=([0-9a-z]*).*"
    # The article body is located by class name.
    ARTICLE_HTML_RULE = "articleTxt"
    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
    DOMAIN = "http://www.hankyung.com/"
    FOLDERNAME = "hankyung"


class Unknown:
    """Stand-in crawler whose ``crawl`` only reports an unknown source."""

    def crawl(self):
        """Print the "Unknown Source" notice; nothing is fetched."""
        print("Unknown Source")


# One ready-made crawler per supported site, keyed by its class name.
classmap = {
    site_class.__name__: site_class()
    for site_class in (
        Haninews,
        Khannews,
        Kukinews,
        Ohmynews,
        Joongang,
        Isplus,
        Cosmopolitan,
        Cosmopolitan2,
        Seoul,
        Asiae,
        Hankyung,
    )
}

if __name__ == "__main__":
    # The first argument names the crawler; unregistered names fall back to
    # Unknown, which only prints a notice.
    requested = sys.argv[1]
    crawler = classmap.get(requested, Unknown())
    crawler.crawl()
	#!py3env/bin/python
	import requests
	import sys
	import pprint
	import bs4


	def get(url):
	    """Return the sorted, de-duplicated ``href`` values of a page's anchors.

	    The page at ``url`` is fetched and parsed by ``get_html``. ``<a>`` tags
	    whose ``href`` is missing or empty are ignored.
	    """
	    anchors = get_html(url).find_all('a')
	    unique_hrefs = {anchor['href'] for anchor in anchors if anchor.get('href')}
	    return sorted(unique_hrefs)

	def get_html(url, parser="html.parser"):
	    """Download ``url`` and return it parsed as a ``bs4.BeautifulSoup`` tree.

	    Args:
	        url: Address of the page, fetched with ``requests.get``.
	        parser: Name of the BeautifulSoup tree builder. It is passed
	            explicitly so the result does not depend on which optional
	            parsers happen to be installed; the default is the one shipped
	            with the standard library.

	    Returns:
	        The parsed document.
	    """
	    response = requests.get(url)
	    return bs4.BeautifulSoup(response.content, parser)


	if __name__ == "__main__":
	    # Script mode: pretty-print the links of the page given as first argument.
	    pprint.pprint(get(sys.argv[1]))
	#!py3env/bin/python
	import os
	import re
	import sys

	import a_href


	##############################################3
	from html.parser import HTMLParser
	class MLStripper(HTMLParser):
	    """HTML parser that keeps only the text found between tags."""

	    def __init__(self):
	        # HTMLParser.__init__ already calls reset(), so it is not repeated.
	        super().__init__()
	        self.fed = []

	    def handle_data(self, d):
	        """Record one run of text reported by the parser."""
	        self.fed.append(d)

	    def get_data(self):
	        """Return everything recorded so far as a single string."""
	        return ''.join(self.fed)


	def strip_tags(html):
	    """Return the text content of ``html`` with all markup removed.

	    Args:
	        html: Markup (or plain text) to strip.

	    Returns:
	        The concatenated text nodes, with character references decoded.
	    """
	    s = MLStripper()
	    s.feed(html)
	    # The parser holds back trailing text that could be an unfinished
	    # character reference (e.g. "AT&T"); close() flushes it so it is not lost.
	    s.close()
	    return s.get_data()
	##############################################3


	class Newssite:
	    """Base crawler: scan a site's front page and save new articles as text.

	    Subclasses describe a site through class attributes only:

	    ARTICLE_URL_ID_REGEX
	        Pattern applied to every link found on ``DOMAIN``; its capture group
	        is the article id, which is also used as the saved file's name.
	    ARTICLE_HTML_RULE
	        Class name or element id of the node that holds the article body.
	    ARTICLE_HTML_SEARCH_METHOD
	        ``Newssite.rule_for_class`` or ``Newssite.rule_for_id``. It is read
	        through ``self``, so the plain function becomes a bound method.
	    DOMAIN
	        Page whose links are scanned; also the base for relative links.
	    FOLDERNAME
	        Directory under ``~/RUNNING/TXT/`` that receives the text files.
	    """
	    ARTICLE_URL_ID_REGEX = ""
	    ARTICLE_HTML_RULE = ""
	    ARTICLE_HTML_SEARCH_METHOD = None
	    DOMAIN = ""
	    FOLDERNAME = ""

	    def rule_for_class(self, t):
	        """Return the first node in ``t`` whose class is ``ARTICLE_HTML_RULE``."""
	        return t.find(attrs={'class': self.ARTICLE_HTML_RULE})

	    def rule_for_id(self, t):
	        """Return the node in ``t`` whose id is ``ARTICLE_HTML_RULE``."""
	        return t.find(id=self.ARTICLE_HTML_RULE)

	    def _folder_location(self, article_id, name):
	        """Return the path of the text file for ``article_id`` in folder ``name``."""
	        return os.path.expanduser("~/RUNNING/TXT/" + name + "/" + article_id + ".txt")

	    def _is_article(self, article_url):
	        """Return the article ids captured from ``article_url`` (may be empty)."""
	        # An empty pattern matches at every position and would report every
	        # link as an article with an empty id.
	        if not self.ARTICLE_URL_ID_REGEX:
	            return []
	        return re.findall(self.ARTICLE_URL_ID_REGEX, article_url)

	    def _is_already_got(self, article_id):
	        """Return True when the article's text file already exists."""
	        return os.path.isfile(self._folder_location(article_id, self.FOLDERNAME))

	    def _guess_full_url(self, article_url):
	        """Turn a raw ``href`` into a fetchable URL.

	        Returns:
	            * for ``javascript:`` links, the first quoted http(s) URL inside
	              them, or ``None`` when there is none;
	            * otherwise ``article_url`` resolved against ``DOMAIN`` (absolute
	              URLs come back unchanged).
	        """
	        from urllib.parse import urljoin

	        if 'javascript:' in article_url:
	            # Accept either quote style around the embedded address.
	            embedded = re.search(r"""["'](https?://[^"']+)""", article_url)
	            return embedded.group(1) if embedded else None
	        # urljoin copes with a DOMAIN lacking a trailing slash and with
	        # root-relative links, which plain concatenation does not.
	        return urljoin(self.DOMAIN, article_url)

	    def _extract_and_save_text(self, article_id, article_url):
	        """Fetch ``article_url`` and write the article body to its text file.

	        Failures are printed instead of raised so that one bad article does
	        not stop the crawl.
	        """
	        try:
	            t = self.ARTICLE_HTML_SEARCH_METHOD(a_href.get_html(article_url))
	            if not t:
	                print(article_id + " is not fit for _extract")
	                return
	            # Build the text before touching the file system: a file left
	            # behind by a failed attempt would mark the article as fetched.
	            s = strip_tags(t.text)
	            target = self._folder_location(article_id, self.FOLDERNAME)
	            os.makedirs(os.path.dirname(target), exist_ok=True)
	            # Explicit UTF-8 so writing does not depend on the locale.
	            with open(target, "w", encoding="utf-8") as out:
	                out.write(s)
	        except Exception as e:
	            print(e)

	    def _what_woman_wanted(self, tags):
	        """Return the ``.text`` of each direct non-Tag child of ``tags``.

	        No caller is visible in this file; the list is returned so that the
	        result is usable.
	        """
	        # Newsroom.py itself does not import bs4 (only a_href does).
	        import bs4

	        return [child.text for child in tags.children
	                if not isinstance(child, bs4.element.Tag)]

	    def crawl(self):
	        """Save every not-yet-stored article linked from ``DOMAIN``."""
	        for article_url in a_href.get(self.DOMAIN):
	            found = self._is_article(article_url)
	            if not found:
	                continue
	            article_id = found[0]
	            # The id patterns use ``*`` and can capture an empty string,
	            # which would map every such link to the same ".txt" file.
	            if not article_id or self._is_already_got(article_id):
	                continue
	            url = self._guess_full_url(article_url)
	            print(url)
	            if url:
	                self._extract_and_save_text(article_id, url)


	class Haninews(Newssite):
	    """Crawler settings for http://www.hani.co.kr/."""
	    # The captured run of digits is the article id.
	    # NOTE(review): ``.*?`` here is a lazy wildcard, not a literal "?".
	    ARTICLE_URL_ID_REGEX = r".*/arti/.*/([0-9]*).*?_fr=.*"
	    # The article body is located by class name.
	    ARTICLE_HTML_RULE = "article-contents"
	    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
	    DOMAIN = "http://www.hani.co.kr/"
	    FOLDERNAME = "hani"


	class Khannews(Newssite):
	    """Crawler settings for http://www.khan.co.kr/."""
	    # The digits following ``artid=`` are the article id.
	    ARTICLE_URL_ID_REGEX = r".*news.khan.co.kr.*artid=([0-9]*).*"
	    # The article body is located by element id.
	    ARTICLE_HTML_RULE = "_article"
	    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_id
	    DOMAIN = "http://www.khan.co.kr/"
	    FOLDERNAME = "khan"


	class Kukinews(Newssite):
	    """Crawler settings for http://www.kukinews.com/."""
	    # The digits following ``arcid=`` are the article id. The attribute must
	    # be named ARTICLE_URL_ID_REGEX: that is the name Newssite reads.
	    ARTICLE_URL_ID_REGEX = r".*news.kukinews.com.*arcid=([0-9]*).*"
	    # Former (misspelled) attribute name, kept as an alias.
	    ARTICLE_URL_ID_RULE = ARTICLE_URL_ID_REGEX
	    # The article body is located by element id.
	    ARTICLE_HTML_RULE = "_article"
	    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_id
	    DOMAIN = "http://www.kukinews.com/"
	    FOLDERNAME = "kukinews"


	class Ohmynews(Newssite):
	    """Crawler settings for http://www.ohmynews.com/."""
	    # The id is the letter "A" plus the digits following ``CNTN_CD=``.
	    ARTICLE_URL_ID_REGEX = r".*at_pg.*CNTN_CD=(A[0-9]*).*"
	    # The article body is located by class name.
	    ARTICLE_HTML_RULE = "at_contents"
	    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
	    DOMAIN = "http://www.ohmynews.com/"
	    FOLDERNAME = "ohmynews"


	class Joongang(Newssite):
	    """Crawler settings for http://joongang.joins.com/."""
	    # Raw string: ``\.`` is a regex escape, not a Python string escape.
	    # The digits before ``.html`` are the article id.
	    ARTICLE_URL_ID_REGEX = r".*article.*/([0-9]*)\.html.*"
	    # The article body is located by class name.
	    ARTICLE_HTML_RULE = "article_content"
	    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
	    DOMAIN = "http://joongang.joins.com/"
	    FOLDERNAME = "joongang"


	class Isplus(Joongang):
	    """Joongang settings applied to http://isplus.joins.com.

	    Only the scanned page and the output folder differ from ``Joongang``.
	    """
	    DOMAIN = "http://isplus.joins.com"
	    FOLDERNAME = "isplus"


	class Cosmopolitan(Newssite):
	    """Crawler settings for http://cosmopolitan.joins.com/."""
	    # The id is the letter "A" plus the digits following ``strArtclCd=``.
	    ARTICLE_URL_ID_REGEX = r".*strArtclCd=(A[0-9]*).*"
	    # The article body is located by class name.
	    ARTICLE_HTML_RULE = "clip_wrap"
	    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
	    DOMAIN = "http://cosmopolitan.joins.com/"
	    FOLDERNAME = "cosmopolitan"


	class Cosmopolitan2(Cosmopolitan):
	    """Cosmopolitan settings with a different article-body class name."""
	    ARTICLE_HTML_RULE = "clip_wrap_none"
	    # Same lookup method as the parent, restated explicitly.
	    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class


	class Seoul(Newssite):
	    """Crawler settings for http://www.seoul.co.kr/."""
	    # The digits following ``id=`` are the article id.
	    ARTICLE_URL_ID_REGEX = r".*newsView.*id=([0-9]*).*"
	    # The article body is located by class name.
	    ARTICLE_HTML_RULE = "atic_txt1"
	    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
	    DOMAIN = "http://www.seoul.co.kr/"
	    FOLDERNAME = "seoul"


	class Asiae(Newssite):
	    """Crawler settings for http://www.asiae.co.kr/news/."""
	    # The digits following ``idxno=`` are the article id.
	    ARTICLE_URL_ID_REGEX = r".*idxno=([0-9]*).*"
	    # The article body is located by class name.
	    ARTICLE_HTML_RULE = "article"
	    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
	    DOMAIN = "http://www.asiae.co.kr/news/"
	    FOLDERNAME = "asiae"


	class Hankyung(Newssite):
	    """Crawler settings for http://www.hankyung.com/."""
	    # Raw string: ``\.`` and ``\?`` are regex escapes, not Python string
	    # escapes. The lowercase letters/digits after ``aid=`` are the article id.
	    ARTICLE_URL_ID_REGEX = r".*newsview\.php\?aid=([0-9a-z]*).*"
	    # The article body is located by class name.
	    ARTICLE_HTML_RULE = "articleTxt"
	    ARTICLE_HTML_SEARCH_METHOD = Newssite.rule_for_class
	    DOMAIN = "http://www.hankyung.com/"
	    FOLDERNAME = "hankyung"


	class Unknown:
	    """Stand-in crawler whose ``crawl`` only reports an unknown source."""

	    def crawl(self):
	        """Print the "Unknown Source" notice; nothing is fetched."""
	        print("Unknown Source")


	# One ready-made crawler per supported site, keyed by its class name.
	classmap = {
	    "Haninews": Haninews(),
	    "Khannews": Khannews(),
	    "Kukinews": Kukinews(),
	    "Ohmynews": Ohmynews(),
	    "Joongang": Joongang(),
	    "Isplus": Isplus(),
	    "Cosmopolitan": Cosmopolitan(),
	    "Cosmopolitan2": Cosmopolitan2(),
	    "Seoul": Seoul(),
	    "Asiae": Asiae(),
	    "Hankyung": Hankyung(),
	}

	if __name__ == "__main__":
	    # The first argument names the crawler; unregistered names fall back to
	    # Unknown, which only prints a notice.
	    classmap.get(sys.argv[1], Unknown()).crawl()