Skip to content

Instantly share code, notes, and snippets.

@dwillis
Created June 8, 2023 23:42
Show Gist options
  • Save dwillis/7e6a2571d64688243879ed349e88787c to your computer and use it in GitHub Desktop.
Save dwillis/7e6a2571d64688243879ed349e88787c to your computer and use it in GitHub Desktop.
import json
import time
from sqlite_utils import Database
from newspaper import Article, fulltext
def _should_skip_url(source, url):
    """Return True for URLs that must not be downloaded for *source*."""
    skipped_fragments = (
        '/video/',
        '/live-news/january-6-hearings',
        'cnn-underscored',
    )
    if url == 'https://www.comparecards.com/':
        return True
    if any(fragment in url for fragment in skipped_fragments):
        return True
    # When scraping CNN, only URLs that mention cnn.com are kept.
    return source == 'CNN' and 'cnn.com' not in url


def create_db(source):
    """Download the articles listed for *source* and store them in SQLite.

    Reads a JSON list of URLs from ``<source>_urls.json`` (name lower-cased),
    downloads and parses each one with ``newspaper.Article``, and inserts one
    row per article into table ``<source>_stories`` of ``<source>.db`` with
    ``url`` as the primary key.

    Args:
        source: Name of the news source. Lower-cased to build the file and
            table names; also stored verbatim in each row's ``source``
            column. The value ``'CNN'`` enables an extra URL filter.

    Raises:
        FileNotFoundError: If the URL list file does not exist. The file is
            read before the database is opened, so no database is touched in
            that case.
    """
    name = source.lower()
    db_file = f"{name}.db"
    table_name = f"{name}_stories"
    urls_file = f"{name}_urls.json"

    # Load the input first so a missing or invalid URL list fails fast.
    with open(urls_file, encoding="utf-8") as handle:
        urls = json.load(handle)

    db = Database(db_file)
    rows = []
    seen = set()
    for url in urls:
        if _should_skip_url(source, url):
            continue
        # "url" is the primary key, so a repeated URL would yield two rows
        # with the same key in the bulk insert below; fetch each URL once.
        if url in seen:
            continue
        seen.add(url)
        print(url)
        article = Article(url)
        try:
            article.download()
            article.parse()
        except Exception as exc:
            # Best-effort scrape: report the failure and move on. Catching
            # Exception (not a bare except) lets Ctrl-C stop the run.
            print(f"skipped {url}: {exc}")
            continue
        rows.append({
            'source': source,
            'url': url,
            # NOTE(review): if publish_date is None this stores the text
            # "None" rather than NULL; kept as-is for compatibility.
            'publish_date': str(article.publish_date),
            'title': article.title,
            'authors': article.authors,
            'text': article.text,
        })
        # Short pause between successful fetches.
        time.sleep(0.1)
    db[table_name].insert_all(rows, pk="url")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment