Skip to content

Instantly share code, notes, and snippets.

@khsing
Created January 15, 2019 08:38
Show Gist options
  • Save khsing/2f93e8a6fdb2d38b974a8229684de785 to your computer and use it in GitHub Desktop.
Save khsing/2f93e8a6fdb2d38b974a8229684de785 to your computer and use it in GitHub Desktop.
Extract articles from HTML files that were downloaded from archive.org
#!/usr/bin/env python
# coding: utf-8
import os
import sys
from datetime import datetime
from bs4 import BeautifulSoup,Tag
def find_all_html(path):
    """Walk *path* recursively and yield the full path of each HTML file.

    A file qualifies when its name ends with the four characters ``html``
    (there is no check for a preceding dot, so ``page.xhtml`` matches while
    ``page.htm`` does not).
    """
    for folder, _subdirs, names in os.walk(path):
        yield from (
            os.path.join(folder, name) for name in names if name.endswith("html")
        )
def extract_articles(content):
    """Yield an article dict for every article found in one HTML document.

    *content* is handed straight to ``BeautifulSoup`` (markup or an open
    file handle).  The first ``<meta name="generator">`` tag selects the
    parser: a value starting with ``wordpress`` uses ``extract_from_wp``,
    one starting with ``movable type`` uses ``extract_from_mt`` (compared
    case-insensitively).  Documents with no generator tag, or with any other
    generator, yield nothing.
    """
    soup = BeautifulSoup(content, features="lxml")
    generator = soup.find(name="meta", attrs={'name': "generator"})
    if generator is None:
        return
    # Use .get() instead of indexing: a generator tag that lacks a "content"
    # attribute should be treated as an unknown generator, not abort the
    # whole run with a KeyError.
    platform = (generator.get("content") or "").lower()
    if platform.startswith("wordpress"):
        yield from extract_from_wp(soup)
    elif platform.startswith("movable type"):
        yield from extract_from_mt(soup)
def extract_from_wp(soup):
    """Yield one parsed article dict per ``<article class="post">`` in *soup*."""
    posts = soup.find_all("article", class_="post")
    yield from map(parse_wp_acticle, posts)
def extract_from_mt(soup):
    """Yield one parsed article dict per ``<div class="entry-asset">`` in *soup*."""
    entries = soup.find_all("div", class_="entry-asset")
    yield from map(parse_mt_acticle, entries)
def parse_mt_acticle(soup):
    """Turn one Movable Type entry element into an article dict.

    *soup* is the entry element (the caller passes ``div.entry-asset``
    nodes).  The returned dict has the keys ``title``, ``basename``, ``id``,
    ``published_time``, ``body`` and ``generator``.

    Nothing here guards against missing markup: if an expected element or
    attribute is absent, the attribute access on the resulting ``None``
    raises.
    """
    # Title comes from the first <h1>, falling back to the first <h2>.
    heading = soup.h1 or soup.h2
    title = heading.get_text()

    # The link is taken from the second sibling after span.separator;
    # its href's last path segment, cut at the first dot, is the basename.
    separator = soup.find('span', attrs={'class': 'separator'})
    link = separator.next_sibling.next_sibling
    href = link.get('href')
    basename = href.split('/')[-1].split('.')[0]

    # The element id is split on "-" and the second piece is kept.
    entry_id = soup.get('id').split('-')[1]

    published = soup.find('abbr', attrs={'class': 'published'}).get('title')

    body_nodes = soup.find('div', attrs={'class': 'asset-body'}).contents
    body = ''.join(map(extract, body_nodes))

    return {
        'title': title,
        'basename': basename,
        'id': entry_id,
        'published_time': convert_time(published),
        'body': body,
        'generator': 'movabletype',
    }
def parse_wp_acticle(soup):
    """Turn one WordPress post element into an article dict.

    *soup* is the post element (the caller passes ``article.post`` nodes).
    The returned dict has the keys ``title``, ``basename``, ``id``,
    ``published_time``, ``body`` and ``generator``.

    Nothing here guards against missing markup: if an expected element or
    attribute is absent, the attribute access on the resulting ``None``
    raises.
    """
    # Title comes from the first <h1>, falling back to the first <h2>.
    heading = soup.h1 or soup.h2
    title = heading.get_text()

    # Basename: last path segment of the rel="bookmark" link, cut at the
    # first dot.
    href = soup.find('a', attrs={'rel': 'bookmark'}).get('href')
    basename = href.split('/')[-1].split('.')[0]

    # The element id is split on "-" and the second piece is kept.
    post_id = soup.get('id').split('-')[1]

    published = soup.find('time', attrs={'class': 'published'}).get('datetime')

    body_nodes = soup.find('div', attrs={'class': 'entry-content'}).contents
    body = ''.join(map(extract, body_nodes))

    return {
        'title': title,
        'basename': basename,
        'id': post_id,
        'published_time': convert_time(published),
        'body': body,
        'generator': 'wordpress',
    }
def convert_time(timestr):
    """Parse an ISO 8601 timestamp string into a ``datetime``.

    A trailing ``Z`` (UTC designator) is accepted on every Python version
    that has ``datetime.fromisoformat``; the result is then timezone-aware
    with a zero UTC offset.

    Raises ``ValueError`` for strings that are not valid ISO timestamps and
    ``TypeError`` for non-string input (both come from ``fromisoformat``).
    """
    # datetime.fromisoformat() only understands the "Z" suffix from Python
    # 3.11 onwards; rewrite it as the equivalent explicit offset so older
    # interpreters parse it too.  Non-strings are passed through untouched so
    # that fromisoformat raises its usual TypeError.
    if isinstance(timestr, str) and timestr.endswith("Z"):
        timestr = timestr[:-1] + "+00:00"
    return datetime.fromisoformat(timestr)
def mt_output(artcle):
    """Render one article dict as a text record.

    Reads the ``title``, ``basename``, ``published_time`` and ``body`` keys
    of *artcle*; ``published_time`` must support ``strftime``-style format
    specs (e.g. a ``datetime``).  Author and categories are fixed strings.
    The layout is presumably the Movable Type import format -- confirm
    against the importer before changing it.
    """
    template_lines = (
        "TITLE: {title}",
        "BASENAME: {basename}",
        "AUTHOR: Guixing",
        "DATE: {published_time:%m/%d/%Y %I:%M:%S %p}",
        "PRIMARY CATEGORY: Backup",
        "CATEGORY: Backup",
        "-----",
        "BODY:",
        "{body}",
        "",  # keeps the trailing newline after the body
    )
    return "\n".join(template_lines).format(**artcle)
def extract(item):
    """Return the piece of body text contributed by one child node.

    ``Tag`` instances are converted with ``repr``; anything else (such as
    text nodes) is returned unchanged.
    """
    return repr(item) if isinstance(item, Tag) else item
def main(argv):
    """Convert every HTML file under ``argv[1]`` and print the records.

    Each article found is rendered with ``mt_output``; the records are
    printed joined by a ``--------`` line, followed by one final
    ``--------`` line.  Exits with a usage message when no directory is
    given.
    """
    if len(argv) < 2:
        sys.exit("usage: {} DIRECTORY".format(argv[0] if argv else "extract"))

    result = []
    for html_path in find_all_html(argv[1]):
        # Binary mode lets BeautifulSoup detect the page's own encoding
        # instead of decoding with the locale default, which can raise
        # UnicodeDecodeError on archived pages.  The ``with`` block closes
        # the handle; extract_articles is a generator, so it is drained
        # while the file is still open.
        with open(html_path, "rb") as handle:
            result.extend(mt_output(article) for article in extract_articles(handle))
    print("--------\n".join(result))
    print("--------\n")


if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment