Skip to content

Instantly share code, notes, and snippets.

@harry-stark
Created December 6, 2022 22:40
Show Gist options
  • Save harry-stark/4619edd4b7f2d8dd00590384dfcc08ef to your computer and use it in GitHub Desktop.
Save harry-stark/4619edd4b7f2d8dd00590384dfcc08ef to your computer and use it in GitHub Desktop.
import pandas as pd
from urllib.parse import urlparse
# NOTE(review): read_csv() is called here without a file path, which raises
# TypeError as written -- pass the CSV to load as the first argument.
# The lookup function below reads the 'netloc' and 'hostname' columns of the
# DataFrame it is given.
df=pd.read_csv()
def _strip_www(host: str) -> str:
    """Return *host* without a single leading ``www.`` label, if present."""
    return host[len("www."):] if host.startswith("www.") else host


def url_matches_dataframe(url: str, df: pd.DataFrame) -> bool:
    """Return whether the host of *url* appears in *df*.

    The URL's netloc (may include port/userinfo) is compared against the
    ``netloc`` column and its hostname against the ``hostname`` column,
    each after dropping a leading ``www.``.

    Args:
        url: The URL to look up. URLs without a ``//`` authority part
            (e.g. ``"example.com/path"``) have no host and never match.
        df: DataFrame with ``netloc`` and ``hostname`` columns.

    Returns:
        ``True`` if either value is found in its column, else ``False``.
    """
    parsed_url = urlparse(url)

    # Strip only a leading "www." -- a blanket replace() would also rewrite
    # hosts such as "awww.site.com" into a different domain.
    netloc = _strip_www(parsed_url.netloc)
    # `hostname` is None when the URL has no authority part; normalise it to
    # an empty string so it cannot raise AttributeError.
    hostname = _strip_www(parsed_url.hostname or "")

    # An empty host is never treated as a match, even if the column contains
    # blank cells. bool() converts numpy.bool_ to the annotated builtin bool.
    netloc_found = bool(netloc) and bool(df["netloc"].isin([netloc]).any())
    hostname_found = bool(hostname) and bool(
        df["hostname"].isin([hostname]).any()
    )
    return netloc_found or hostname_found
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment