Skip to content

Instantly share code, notes, and snippets.

@jasonbot
Last active May 10, 2023 23:16
Show Gist options
  • Save jasonbot/4c2a3eddd27dff2dc497a06d1071f2f6 to your computer and use it in GitHub Desktop.
Save jasonbot/4c2a3eddd27dff2dc497a06d1071f2f6 to your computer and use it in GitHub Desktop.
Consume a Common Crawl (WARC) file and convert it into a Parquet table
import gzip
import os
import pathlib
import zlib

import pandas
import warcio.archiveiterator
from bs4 import BeautifulSoup
def warc_stream(stream):
    """Yield a ``{"url": ..., "text": ...}`` row per WARC record with a URI.

    Args:
        stream: Binary file-like object containing a WARC archive; it is
            handed directly to ``warcio.archiveiterator.ArchiveIterator``.

    Yields:
        dict: ``"url"`` is the record's ``WARC-Target-URI`` header and
        ``"text"`` is the text BeautifulSoup extracts from the record
        payload (``""`` when nothing is extracted).

    Records without a ``WARC-Target-URI`` header are skipped. A record whose
    payload cannot be parsed is reported on stdout and skipped, so one bad
    record does not abort the whole archive.
    """
    for record in warcio.archiveiterator.ArchiveIterator(stream):
        uri = record.rec_headers.get("WARC-Target-URI")
        if not uri:
            # No row is ever produced without a URI, so skip the record
            # before paying for the read and the HTML parse.
            continue

        payload = record.content_stream().read()

        # Best-effort: some payloads are still gzip-compressed at this point.
        # Only attempt decompression when the gzip magic number is present,
        # and fall back to the raw bytes if the data turns out to be
        # truncated or corrupt.
        if payload.startswith(b"\x1f\x8b"):
            try:
                payload = gzip.decompress(payload)
            except (OSError, EOFError, zlib.error):
                pass

        # Deliberately broad: any parser failure on a single record should
        # be reported and skipped rather than kill the conversion.
        try:
            text = BeautifulSoup(payload, "html.parser").get_text() or ""
        except Exception as e:
            print(f" URI {uri} was not an HTML stream: {e}")
            continue

        # The yield lives outside the try block so that exceptions raised
        # by (or thrown in from) the consumer are not swallowed here.
        yield {"url": uri, "text": text}
# Convert every ``*.warc.gz`` archive in the current working directory into a
# Parquet file written next to it as ``<archive file name>.parquet``.
warc_archives = pathlib.Path(".").glob("*.warc.gz")
for archive_path in warc_archives:
    print(archive_path)
    parquet_name = archive_path.name + ".parquet"
    with archive_path.open("rb") as raw:
        # The DataFrame constructor drains the generator, so every record is
        # read while the archive is still open.
        rows = warc_stream(raw)
        frame = pandas.DataFrame(data=rows)
    frame.to_parquet(parquet_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment