-
-
Save moaikim/bfc0ae4d19e0b463cb24da44f0c72faf to your computer and use it in GitHub Desktop.
Google News - Crawling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from apscheduler.schedulers.background import BackgroundScheduler | |
from apscheduler.jobstores.base import JobLookupError | |
import requests | |
import datetime | |
import maya | |
import feedparser | |
import google_news_dbmanager | |
class GoogleNewsCron():
    """Schedule Google News RSS crawls and persist the results.

    A background APScheduler instance runs :meth:`exec` once, on a fixed
    interval, or on a cron trigger, depending on the mode given to
    :meth:`run`. Parsed entries are handed to a ``GoogleNewsDBManager``.
    """

    # Locale query-string suffix appended to the search URL per country code.
    _LOCALE_PARAMS = {
        'en': '&hl=en-NG&gl=NG&ceid=NG:en',
        'ko': '&hl=ko&gl=KR&ceid=KR:ko',
    }

    # Seconds to wait for Google before giving up, so a hung request cannot
    # block a scheduler worker indefinitely.
    _REQUEST_TIMEOUT = 10

    def __init__(self):
        """Start the background scheduler and open the database manager."""
        print('크론 시작')
        self.scheduler = BackgroundScheduler(job_defaults={'max_instances': 10, 'coalesce': False})
        self.scheduler.start()
        self.dbManager = google_news_dbmanager.GoogleNewsDBManager()

    def __del__(self):
        self.stop()

    def exec(self, country, keyword):
        """Fetch the last day's news for ``keyword`` and store every entry.

        Args:
            country: Locale code; ``'en'`` and ``'ko'`` add locale parameters
                to the request, any other value sends none.
            keyword: Search term. It is URL-encoded before being placed in
                the query string.
        """
        # Function-scope import keeps this block self-contained.
        from urllib.parse import quote_plus

        print('Google News Cron Start: ' + datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
        url = 'https://news.google.com/rss/search?q={}+when:1d'.format(quote_plus(keyword))
        url += self._LOCALE_PARAMS.get(country, '')

        try:
            res = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as err:
            print('Error Requests: {}'.format(err))
            return

        if res.status_code != 200:
            print('Google 검색 에러')
            return

        for data in feedparser.parse(res.text).entries:
            try:
                data['published'] = maya.parse(data.published).datetime(to_timezone="Asia/Seoul", naive=True)
                data['source'] = data.source.title
            except (AttributeError, KeyError) as err:
                # One entry lacking a field must not abort the whole batch.
                print('Skipping malformed entry: {}'.format(err))
                continue
            self.dbManager.queryInsertGoogleNewsTable(data)

    def run(self, mode, country, keyword):
        """Prepare the tables, record the keyword, and schedule the crawl.

        Args:
            mode: ``'once'`` adds the job without a trigger, ``'interval'``
                repeats it every 10 seconds, ``'cron'`` uses a cron trigger
                with ``second='*/10'``. Any other value schedules nothing.
            country: Locale code forwarded to :meth:`exec`.
            keyword: Search term forwarded to :meth:`exec`.
        """
        print("실행!")
        self.dbManager.queryCreateGoogleNewsTable(keyword)
        self.dbManager.queryCreateKeywordTable()
        self.dbManager.queryInsertKeywordTable({
            'keyword': keyword,
            'country': country,
        })
        if mode == 'once':
            self.scheduler.add_job(self.exec, args=[country, keyword])
        elif mode == 'interval':
            self.scheduler.add_job(self.exec, 'interval', seconds=10, args=[country, keyword])
        elif mode == 'cron':
            self.scheduler.add_job(self.exec, 'cron', second='*/10', args=[country, keyword])

    def stop(self):
        """Shut down the scheduler and close the database (best effort).

        Safe to call repeatedly and on a partially constructed instance:
        each resource is detached before it is released, so a second call
        (for example from ``__del__``) finds nothing left to do.
        """
        scheduler = getattr(self, 'scheduler', None)
        self.scheduler = None
        if scheduler is not None:
            try:
                scheduler.shutdown()
            except Exception as err:
                # Cleanup must not raise; report the failure instead of hiding it.
                print('Scheduler shutdown failed: {}'.format(err))

        db_manager = getattr(self, 'dbManager', None)
        self.dbManager = None
        if db_manager is not None:
            try:
                # The DB manager exposes stop(); the previous close() call
                # raised AttributeError, which the bare except swallowed.
                db_manager.stop()
            except Exception as err:
                print('DB manager shutdown failed: {}'.format(err))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sqlite3 | |
class GoogleNewsDBManager:
    """SQLite storage for crawled Google News entries and tracked keywords.

    News rows live in one table per keyword (``google_news_<keyword>``);
    tracked keywords live in the ``keyword`` table. All row values are bound
    as SQL parameters and all table/column names are quoted as identifiers,
    so keywords and article text cannot alter the statements.
    """

    def __init__(self, db_name='google_news.db'):
        """Open (or create) the database.

        Args:
            db_name: Path passed to ``sqlite3.connect``. Defaults to the
                original hard-coded file name.
        """
        print("DB Manager 시작")
        self.DBName = db_name
        # NOTE(review): check_same_thread=False lets other threads use this
        # connection; concurrent writes are not serialized here - confirm
        # whether callers need a lock.
        self.db = sqlite3.connect(self.DBName, check_same_thread=False)
        self.db.row_factory = sqlite3.Row
        self.google_news_table = 'google_news'
        self.keyword_table = 'keyword'
        # The link identifies an article. Keying on `source` (as before) made
        # INSERT OR IGNORE keep only one article per publisher. Tables that
        # already exist keep their old schema.
        self.google_news_columns = {
            'published': 'text',
            'source': 'text',
            'title': 'text',
            'link': 'text PRIMARY KEY',
        }
        self.keyword_columns = {
            'keyword': 'text PRIMARY KEY',
            'country': 'text',
        }

    def __del__(self):
        self.stop()

    def stop(self):
        """Close the connection; harmless if it was never opened."""
        db = getattr(self, 'db', None)
        if db is None:
            return
        try:
            db.close()
        except sqlite3.Error as err:
            print('DB close failed: {}'.format(err))

    def close(self):
        """Alias of :meth:`stop`."""
        self.stop()

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` as a double-quoted SQL identifier."""
        return '"{}"'.format(str(name).replace('"', '""'))

    @staticmethod
    def _news_table_name(keyword):
        """Return the unquoted per-keyword news table name."""
        return 'google_news_' + keyword.lower()

    def _create_table(self, table, columns):
        """Create ``table`` with ``columns`` (name -> type) if it is missing."""
        column_info = ",".join(
            '{} {}'.format(self._quote_identifier(col_name), col_type)
            for col_name, col_type in columns.items()
        )
        query = "CREATE TABLE IF NOT EXISTS {} ({})".format(
            self._quote_identifier(table), column_info)
        with self.db:  # commits on success, rolls back on error
            self.db.execute(query)

    def _insert_row(self, table, columns, values):
        """Insert one row, ignoring it when the primary key already exists."""
        names = list(columns)
        query = 'INSERT OR IGNORE INTO {} ({}) VALUES ({})'.format(
            self._quote_identifier(table),
            ','.join(self._quote_identifier(name) for name in names),
            ','.join(['?'] * len(names)),
        )
        # str() keeps the stored representation of non-string values
        # (e.g. datetimes) the same as before.
        params = [str(values[name]) for name in names]
        with self.db:
            self.db.execute(query, params)

    def queryCreateGoogleNewsTable(self, keyword):
        """Create the news table for ``keyword`` and make it the insert target."""
        self.google_news_table = self._news_table_name(keyword)
        self._create_table(self.google_news_table, self.google_news_columns)

    def queryInsertGoogleNewsTable(self, values):
        """Insert one news entry into the current news table.

        Args:
            values: Mapping providing every key of ``google_news_columns``.
        """
        self._insert_row(self.google_news_table, self.google_news_columns, values)

    def queryDeleteAllGoogleNewsTable(self, keyword):
        """Drop the news table belonging to ``keyword`` if it exists."""
        table = self._quote_identifier(self._news_table_name(keyword))
        with self.db:
            self.db.execute("DROP TABLE IF EXISTS {}".format(table))

    def querySelectAllGoogleNewsTable(self, keyword):
        """Return every row of the news table belonging to ``keyword``."""
        table = self._quote_identifier(self._news_table_name(keyword))
        return self.db.execute("SELECT * FROM {}".format(table)).fetchall()

    def queryCreateKeywordTable(self):
        """Create the keyword table if it is missing."""
        self._create_table(self.keyword_table, self.keyword_columns)

    def queryInsertKeywordTable(self, values):
        """Insert one tracked keyword.

        Args:
            values: Mapping providing every key of ``keyword_columns``.
        """
        self._insert_row(self.keyword_table, self.keyword_columns, values)

    def queryDeleteKeywordTable(self, keyword):
        """Delete the row for ``keyword`` from the keyword table."""
        query = "DELETE FROM {} WHERE keyword = ?".format(
            self._quote_identifier(self.keyword_table))
        with self.db:
            self.db.execute(query, (keyword,))

    def querySelectAllKeywordTable(self):
        """Return every row of the keyword table."""
        query = "SELECT * FROM {}".format(self._quote_identifier(self.keyword_table))
        return self.db.execute(query).fetchall()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import argparse | |
import google_news_cron | |
def main():
    """Parse the command line, start the crawler, and run until interrupted.

    ``mode`` is an optional positional argument; when omitted it falls back
    to ``'once'``. The crawler is always stopped on the way out, whether the
    loop ends through Ctrl-C or through an unexpected error.
    """
    parser = argparse.ArgumentParser()
    # nargs='?' is what makes the declared default reachable; without it the
    # positional argument is mandatory and default='once' is never used.
    parser.add_argument('mode', type=str, nargs='?', choices=['once', 'interval', 'cron'], default='once', help="Choose how you want to run the code")
    parser.add_argument('--country', type=str, required=False, default='en', choices=['en', 'ko'], help="Which country will you search for news?")
    parser.add_argument('--keyword', type=str, required=False, default='all', help="Enter keywords to crawl")
    args = parser.parse_args()

    # Bound before the try so cleanup never hits an undefined name when the
    # interrupt arrives while the crawler is still being constructed.
    cron = None
    try:
        cron = google_news_cron.GoogleNewsCron()
        cron.run(args.mode, args.country, args.keyword)
        # The scheduler works in a background thread; keep the main thread alive.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        pass
    finally:
        if cron is not None:
            cron.stop()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment