Skip to content

Instantly share code, notes, and snippets.

@Marinell0
Last active April 17, 2024 14:48
Show Gist options
  • Save Marinell0/98e141ac3c01cac9254cb325cf97f210 to your computer and use it in GitHub Desktop.
Save Marinell0/98e141ac3c01cac9254cb325cf97f210 to your computer and use it in GitHub Desktop.
Filter lines from a CSV file inside a zip archive. The lines are filtered with a regex while the file is streamed, so only the matching rows are parsed into the DataFrame.
import pandas as pd
import zipfile
import io
import re
from typing import Iterator
def rows_with_index(pattern: re.Pattern[str], sep: str, file) -> Iterator[str]:
    """
    Yield the lines of a binary stream that match a pattern, prefixed with their row index.
    Each yielded string has the form ``<row_index><sep><line>``, where ``row_index``
    is the 0-based position of the line in the stream. Lines that do not match are
    skipped but still counted, so the prefix always refers to the original position.
    Parameters
    ----------
    pattern: re.Pattern[str]
        compiled regex pattern; a line is kept when ``pattern.match(line)`` succeeds,
        i.e. the match is anchored at the start of the line
    sep: str
        separator placed between the row index and the line
    file
        binary file-like object; it is decoded through ``io.TextIOWrapper`` using
        that class's default encoding
    Yields
    ------
    str
        the matching line prefixed with its index; the line is passed through as
        read, so it keeps its trailing newline when it has one
    """
    # enumerate keeps counting non-matching lines, so the index is the position
    # in the stream rather than the position among the matches.
    for row_index, row in enumerate(io.TextIOWrapper(file)):
        if pattern.match(row):
            yield f"{row_index}{sep}{row}"
def filter_lines_from_zipped_csv(
    path: str,
    pattern: re.Pattern[str],
    sep: str = ",",
    column_names: list[str] | None = None,
) -> pd.DataFrame:
    """
    Filter lines from a zipped csv file (with one file inside) using a regex pattern.
    The pattern needs to be compiled generating the re.Pattern object.
    Only the first member of the archive is read. Every matching line is prefixed
    with its 0-based row index in the file, and that index becomes the index of the
    returned dataframe. No line is treated as a header.
    If no lines are found, an empty dataframe is returned.
    Parameters
    ----------
    path: str
        path to the zipped csv file
    pattern: re.Pattern[str]
        compiled regex pattern
    sep: str, default ,
        separator of the csv file
    column_names: list[str] | None, default None
        column names handed to ``pd.read_csv`` as ``names``
    Returns
    -------
    pd.DataFrame
        the matching rows, indexed by their row index in the csv file
    Raises
    ------
    IndexError
        if the zip archive contains no members
    """
    with zipfile.ZipFile(path, 'r') as zip_file:
        with zip_file.open(zip_file.namelist()[0]) as file:
            rows = rows_with_index(pattern, sep, file)
            # The generator reads lazily, so it has to be consumed while the
            # archive member is still open. Drop any trailing newline a row still
            # carries before joining, so the csv text has no empty line between
            # records.
            csv_text = "\n".join(row.removesuffix("\n") for row in rows)
    try:
        return pd.read_csv(
            io.StringIO(csv_text),
            sep=sep,
            index_col=0,
            header=None,
            names=column_names,
            lineterminator='\n',
        )
    except pd.errors.EmptyDataError:
        # read_csv cannot infer any columns when nothing matched; honour the
        # documented contract and return an empty dataframe instead of raising.
        return pd.DataFrame(columns=column_names)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment