Skip to content

Instantly share code, notes, and snippets.

@pbamotra
Created April 28, 2020 01:18
Show Gist options
  • Save pbamotra/757958117afbd98172553b4b42af3ce6 to your computer and use it in GitHub Desktop.
Save pbamotra/757958117afbd98172553b4b42af3ce6 to your computer and use it in GitHub Desktop.
amazon-books-wishlist-to-pandas
import datetime
import glob
from lxml import etree
import pandas as pd
def get_books(file):
    """Collect one text entry per table row from an HTML file.

    The file is parsed with lxml's HTML parser. The first element matching
    the fixed path ``/html/body/div[1]/div/table/tbody`` is taken as the
    table body, and for every child of that element the first text node
    found under ``td/*`` is kept (children with no such text are skipped).

    NOTE(review): presumably each kept text is a book title from a saved
    wishlist page -- confirm against the actual HTML export.

    Args:
        file: Path or file object accepted by ``etree.parse``.

    Returns:
        A list with the collected text values, in document order.

    Raises:
        IndexError: If the fixed table-body path matches nothing.
    """
    html_parser = etree.HTMLParser()
    document = etree.parse(file, parser=html_parser)
    table_body = document.xpath('/html/body/div[1]/div/table/tbody')[0]

    titles = []
    for table_row in table_body.iterchildren():
        cell_texts = table_row.xpath('td/*/text()')
        # Only the first text node of a row is kept.
        if cell_texts:
            titles.append(cell_texts[0])
    return titles
def run():
    """Back up the books found in local HTML exports and refresh the merged list.

    Steps:
        1. Parse every ``Books I want to read*.html`` file in the current
           working directory with ``get_books``.
        2. Write the collected names to a dated snapshot
           ``amazon.<YYYY-MM-DD>.amznbkup.csv``.
        3. Merge every ``*.amznbkup.csv`` snapshot (including the one just
           written), drop duplicate rows and write the result to
           ``amazon.latest.csv``.

    All files are read from and written to the current working directory.
    """
    allbooks = []
    for html_file in glob.glob('Books I want to read*.html'):
        allbooks += get_books(html_file)

    # ISO date (YYYY-MM-DD) of the local day, used to name today's snapshot.
    today = datetime.date.today().isoformat()
    df = pd.DataFrame(allbooks, columns=['book_name'])
    df.to_csv(f'amazon.{today}.amznbkup.csv', index=False)

    # Snapshot names embed an ISO date, so sorting them gives a stable,
    # oldest-first merge order instead of glob's arbitrary order.
    prev_backups = sorted(glob.glob('*.amznbkup.csv'))
    latest = pd.concat(
        [pd.read_csv(backup) for backup in prev_backups],
        ignore_index=True,
    ).drop_duplicates()

    # Write the merged history; the original wrote `df` here, which made
    # amazon.latest.csv a copy of today's snapshot only.
    latest.to_csv('amazon.latest.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment