Skip to content

Instantly share code, notes, and snippets.

@moaikim
Created November 27, 2020 16:22
Show Gist options
  • Save moaikim/bfc0ae4d19e0b463cb24da44f0c72faf to your computer and use it in GitHub Desktop.
Save moaikim/bfc0ae4d19e0b463cb24da44f0c72faf to your computer and use it in GitHub Desktop.
Google News - Crawling
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.jobstores.base import JobLookupError
import requests
import datetime
import maya
import feedparser
import google_news_dbmanager
class GoogleNewsCron():
    """Schedule Google News RSS crawls and persist the entries via the DB manager.

    A ``BackgroundScheduler`` runs :meth:`exec` either once, on a 10-second
    interval, or on a ``*/10`` second cron trigger, depending on the mode
    given to :meth:`run`.
    """

    # Locale query-string suffix appended to the RSS search URL per country code.
    _LOCALE_PARAMS = {
        'en': '&hl=en-NG&gl=NG&ceid=NG:en',
        'ko': '&hl=ko&gl=KR&ceid=KR:ko',
    }

    # Seconds to wait for the HTTP response. Jobs fire every 10 seconds, so a
    # request that never returns would otherwise tie up scheduler workers.
    _REQUEST_TIMEOUT = 10

    def __init__(self):
        """Start the background scheduler and open the DB manager."""
        print('크론 시작')
        self._stopped = False
        self.scheduler = BackgroundScheduler(job_defaults={'max_instances': 10, 'coalesce': False})
        self.scheduler.start()
        self.dbManager = google_news_dbmanager.GoogleNewsDBManager()

    def __del__(self):
        self.stop()

    @staticmethod
    def _build_url(country, keyword):
        """Return the RSS search URL for *keyword*, localized for *country*.

        The keyword is percent-encoded so characters such as ``&``, ``#`` or
        spaces cannot alter the query string. Unknown country codes get no
        locale suffix.
        """
        # Function-scope import: keeps the file's import header untouched.
        from urllib.parse import quote_plus

        url = 'https://news.google.com/rss/search?q={}+when:1d'.format(quote_plus(str(keyword)))
        return url + GoogleNewsCron._LOCALE_PARAMS.get(country, '')

    def exec(self, country, keyword):
        """Fetch the RSS feed for *keyword* and insert every entry into the DB.

        Args:
            country: Locale code; ``'en'`` and ``'ko'`` add locale parameters.
            keyword: Search term sent to Google News.

        Request failures and non-200 responses are reported on stdout and
        end the run; entries missing a field are skipped individually.

        NOTE(review): rows go to whichever table the DB manager created last
        (``queryCreateGoogleNewsTable`` sets it), not necessarily the table
        for this job's keyword when several keywords are scheduled.
        """
        print('Google News Cron Start: ' + datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
        url = self._build_url(country, keyword)
        try:
            res = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as err:
            print('Error Requests: {}'.format(err))
            return

        if res.status_code != 200:
            print('Google 검색 에러')
            return

        for entry in feedparser.parse(res.text).entries:
            try:
                entry['published'] = maya.parse(entry.published).datetime(to_timezone="Asia/Seoul", naive=True)
                entry['source'] = entry.source.title
            except (AttributeError, KeyError, ValueError) as err:
                # One malformed entry must not abort the rest of the batch.
                print('Skipping malformed entry: {}'.format(err))
                continue
            self.dbManager.queryInsertGoogleNewsTable(entry)

    def run(self, mode, country, keyword):
        """Create the tables, record the keyword, and schedule the crawl job.

        Args:
            mode: ``'once'`` (single run), ``'interval'`` (every 10 seconds)
                or ``'cron'`` (cron trigger with ``second='*/10'``).
            country: Locale code forwarded to :meth:`exec`.
            keyword: Search term forwarded to :meth:`exec`.
        """
        print("실행!")
        self.dbManager.queryCreateGoogleNewsTable(keyword)
        self.dbManager.queryCreateKeywordTable()
        self.dbManager.queryInsertKeywordTable({
            'keyword': keyword,
            'country': country,
        })
        job_args = [country, keyword]
        if mode == 'once':
            self.scheduler.add_job(self.exec, args=job_args)
        elif mode == 'interval':
            self.scheduler.add_job(self.exec, 'interval', seconds=10, args=job_args)
        elif mode == 'cron':
            self.scheduler.add_job(self.exec, 'cron', second='*/10', args=job_args)
        else:
            # Still schedules nothing for an unknown mode, but no longer silently.
            print('Unknown mode, no job scheduled: {}'.format(mode))

    def stop(self):
        """Shut down the scheduler and release the DB manager (best effort).

        Safe to call more than once and on a partially constructed instance;
        ``__del__`` calls it again after an explicit stop.
        """
        if getattr(self, '_stopped', False):
            return
        self._stopped = True

        scheduler = getattr(self, 'scheduler', None)
        if scheduler is not None:
            try:
                scheduler.shutdown()
            except Exception as err:  # cleanup boundary: report, never raise
                print('Scheduler shutdown failed: {}'.format(err))

        db_manager = getattr(self, 'dbManager', None)
        if db_manager is not None:
            try:
                # The DB manager's release method is stop(); it defines no close().
                db_manager.stop()
            except Exception as err:  # cleanup boundary: report, never raise
                print('DB manager shutdown failed: {}'.format(err))
import sqlite3
class GoogleNewsDBManager:
    """SQLite storage for crawled news articles and the crawled keywords.

    Articles live in one table per keyword (``google_news_<keyword>``);
    keywords live in a single ``keyword`` table. Values are always passed as
    bound parameters and table names are quoted as identifiers, so quotes or
    spaces in a keyword or article field cannot break or alter the SQL.
    """

    def __init__(self, db_name='google_news.db'):
        """Open the database.

        Args:
            db_name: SQLite database path; defaults to the file used so far.
                ``':memory:'`` gives a throwaway database.
        """
        print("DB Manager 시작")
        self.DBName = db_name
        # check_same_thread=False: the connection is used from other threads.
        self.db = sqlite3.connect(self.DBName, check_same_thread=False)
        self.db.row_factory = sqlite3.Row
        self.google_news_table = 'google_news'
        self.keyword_table = 'keyword'
        # The key is the article link. Keying on ``source`` made
        # INSERT OR IGNORE keep only one article per publisher. Tables that
        # already exist keep their old schema (CREATE TABLE IF NOT EXISTS).
        self.google_news_columns = {
            'published': 'text',
            'source': 'text',
            'title': 'text',
            'link': 'text PRIMARY KEY',
        }
        self.keyword_columns = {
            'keyword': 'text PRIMARY KEY',
            'country': 'text',
        }

    def __del__(self):
        self.stop()

    def stop(self):
        """Close the connection; safe to call repeatedly or before it exists."""
        db = getattr(self, 'db', None)
        if db is None:
            return
        try:
            db.close()
        except sqlite3.Error as err:
            print('DB close failed: {}'.format(err))

    def close(self):
        """Alias of :meth:`stop` for callers expecting the conventional name."""
        self.stop()

    @staticmethod
    def _quote_identifier(name):
        """Return *name* as a double-quoted SQL identifier."""
        return '"{}"'.format(str(name).replace('"', '""'))

    @staticmethod
    def _news_table_name(keyword):
        """Return the unquoted per-keyword article table name."""
        return 'google_news_' + keyword.lower()

    def _create_table(self, table, columns):
        """Create *table* with the given ``{name: type}`` columns if missing."""
        column_info = ",".join(
            col_name + ' ' + col_type for col_name, col_type in columns.items()
        )
        query = "CREATE TABLE IF NOT EXISTS {} ({})".format(
            self._quote_identifier(table), column_info
        )
        with self.db:  # commits on success, rolls back on error
            self.db.execute(query)

    def _insert_row(self, table, columns, values):
        """Insert one row, ignoring it when the primary key already exists."""
        names = list(columns)
        query = 'INSERT OR IGNORE INTO {} ({}) VALUES ({})'.format(
            self._quote_identifier(table),
            ','.join(names),
            ','.join('?' for _ in names),
        )
        # str() keeps the stored text form used so far (e.g. for datetimes).
        params = [str(values[name]) for name in names]
        with self.db:
            self.db.execute(query, params)

    def queryCreateGoogleNewsTable(self, keyword):
        """Create the article table for *keyword* and make it the insert target."""
        self.google_news_table = self._news_table_name(keyword)
        self._create_table(self.google_news_table, self.google_news_columns)

    def queryInsertGoogleNewsTable(self, values, keyword=None):
        """Insert one article.

        Args:
            values: Mapping holding at least the article column names.
            keyword: Optional keyword selecting the target table; when
                omitted, the table of the last created keyword is used.
        """
        table = self.google_news_table if keyword is None else self._news_table_name(keyword)
        self._insert_row(table, self.google_news_columns, values)

    def queryDeleteAllGoogleNewsTable(self, keyword):
        """Drop the article table for *keyword* if it exists."""
        query = "DROP TABLE IF EXISTS {}".format(
            self._quote_identifier(self._news_table_name(keyword))
        )
        with self.db:
            self.db.execute(query)

    def querySelectAllGoogleNewsTable(self, keyword):
        """Return all rows of the article table for *keyword*."""
        query = "SELECT * FROM {}".format(
            self._quote_identifier(self._news_table_name(keyword))
        )
        return self.db.execute(query).fetchall()

    def queryCreateKeywordTable(self):
        """Create the keyword table if it does not exist."""
        self._create_table(self.keyword_table, self.keyword_columns)

    def queryInsertKeywordTable(self, values):
        """Insert one keyword row; an existing keyword is left unchanged."""
        self._insert_row(self.keyword_table, self.keyword_columns, values)

    def queryDeleteKeywordTable(self, keyword):
        """Delete the row for *keyword* from the keyword table."""
        query = "DELETE FROM {} WHERE KEYWORD=?".format(
            self._quote_identifier(self.keyword_table)
        )
        with self.db:
            self.db.execute(query, (keyword,))

    def querySelectAllKeywordTable(self):
        """Return all rows of the keyword table."""
        query = "SELECT * FROM {}".format(self._quote_identifier(self.keyword_table))
        return self.db.execute(query).fetchall()
import time
import argparse
import google_news_cron
def main():
    """Parse the command line, start the crawler, and keep the process alive.

    The scheduler runs in a background thread, so the main thread sleeps in
    a loop until interrupted (Ctrl+C); the crawler is then stopped.
    """
    parser = argparse.ArgumentParser()
    # nargs='?' makes the positional optional so default='once' can take effect;
    # without it argparse always requires the argument.
    parser.add_argument('mode', type=str, nargs='?', choices=['once','interval','cron'], default='once', help="Choose how you want to run the code")
    parser.add_argument('--country', type=str, required=False, default='en', choices=['en','ko'], help="Which country will you search for news?")
    parser.add_argument('--keyword', type=str, required=False, default='all', help="Enter keywords to crawl")
    args = parser.parse_args()

    # Constructed outside the try block so the cleanup below never touches an
    # unbound name when construction itself is interrupted or fails.
    news_cron = google_news_cron.GoogleNewsCron()
    try:
        news_cron.run(args.mode, args.country, args.keyword)
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        pass
    finally:
        news_cron.stop()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment