Skip to content

Instantly share code, notes, and snippets.

@YuzuRyo61
Created February 26, 2024 03:08
Show Gist options
  • Save YuzuRyo61/7e1ded0c92b8b5a22fb1600f6ac03535 to your computer and use it in GitHub Desktop.
Save YuzuRyo61/7e1ded0c92b8b5a22fb1600f6ac03535 to your computer and use it in GitHub Desktop.
Pocketでエクスポートしたデータを、CSVに加工するPythonスクリプト
import sys
import os
import datetime
import csv
from html.parser import HTMLParser
# Path of the exported HTML file the script reads (relative to the current
# working directory).
EXPORTED_FILE_NAME = "ril_export.html"
# Path of the CSV file the script writes (relative to the current working
# directory); it is opened in "w" mode, so an existing file is overwritten.
OUTPUT_FILE_NAME = "ril_export_csv.csv"
class RilExportHTMLParser(HTMLParser):
"""
ril_export.html のaタグについて:
hrefにはpocketで保存したURL
time_addedにはUNIX時間で追加された時間が入力されている
tagsにはタグが追加されるが、使ったことがないのでわからない
"""
def __init__(self):
HTMLParser.__init__(self)
self.title = False
self.href = False
self.data = []
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
attrs_dict = dict(attrs)
if tag == "li":
self.data.append({})
self.title = True
self.href = True
if tag == "a" and self.href == True:
self.data[-1].update({
"url": attrs_dict.get("href"),
"added": datetime.datetime.fromtimestamp(
int(attrs_dict.get("time_added", "0"))) if attrs_dict.get("time_added", None) is not None else None,
})
def handle_data(self, data):
if self.title == True or self.href == True:
self.data[-1].update({"title": data})
self.title = False
self.href = False
def main() -> None:
    """Convert EXPORTED_FILE_NAME to a CSV file at OUTPUT_FILE_NAME.

    Reads the exported HTML as UTF-8, parses it with RilExportHTMLParser and
    writes one CSV row per parsed entry with the columns ``title``, ``url``
    and ``added``.  Exits with status 1 if the input file does not exist.
    """
    if not os.path.exists(EXPORTED_FILE_NAME):
        print(f"{EXPORTED_FILE_NAME} is not found. Abort.")
        sys.exit(1)

    with open(EXPORTED_FILE_NAME, mode="r", encoding="utf-8") as ef:
        exported_raw_data = ef.read()

    parser = RilExportHTMLParser()
    parser.feed(exported_raw_data)
    # Flush any text HTMLParser is still buffering at the end of the input.
    parser.close()

    # newline="" is what the csv module requires of file objects: it keeps
    # newlines embedded in quoted fields intact and makes the row terminator
    # exactly the "\n" requested below on every platform.
    with open(OUTPUT_FILE_NAME, mode="w", encoding="utf-8", newline="") as of:
        writer = csv.DictWriter(of, ["title", "url", "added"], lineterminator="\n")
        writer.writeheader()
        writer.writerows(parser.data)
    print("export OK")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment