Elias Dabbas (eliasdabbas)
@eliasdabbas
eliasdabbas / url_to_html_sitemap.py
Last active April 16, 2024 10:03
Create an HTML sitemap for a list of URLs and their anchor text
# Create anchors for all letters:
import string

print('<h3>' + '&nbsp;&nbsp;&nbsp;&nbsp;'.join([f'<a href="#{letter}">{letter}</a>' for letter in string.ascii_uppercase]) + '</h3>')

# Assuming you have a DataFrame `players_df` with the columns "full_name" and "loc":
for letter in string.ascii_uppercase:
    df = players_df[players_df['full_name'].str[0].eq(letter)]
    print()
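    # Illustrative continuation; the gist preview is truncated after the blank
    # print() above. Print a letter heading (the anchor target), then one link
    # per row whose name starts with `letter`:
    print(f'<h2 id="{letter}">{letter}</h2>')
    for name, loc in zip(df['full_name'], df['loc']):
        print(f'<p><a href="{loc}">{name}</a></p>')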
@eliasdabbas
eliasdabbas / url_to_xml_sitemap.py
Created April 15, 2024 12:15
Convert a list of URLs to an XML sitemap
import datetime

import pandas as pd

lastmod = datetime.datetime.now(datetime.UTC).strftime('%Y-%m-%d')

url_list = [
    'https://nbastats.pro/player/Eric_Moreland',
    'https://nbastats.pro/player/Marc_Iavaroni',
    'https://nbastats.pro/player/Keith_Tower',
    'https://nbastats.pro/player/Hakeem_Olajuwon',
    'https://nbastats.pro/player/Mike_Price',
    'https://nbastats.pro/player/Doug_Collins',
    # ... the rest of the URL list is truncated in this preview
]
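The preview cuts off inside the URL list, before the sitemap itself is built. A minimal sketch of the remaining step, reusing url_list and lastmod from above (the string-templating approach and the output filename are illustrative, not necessarily the gist's):

urlset_open = ('<?xml version="1.0" encoding="UTF-8"?>\n'
               '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
url_entries = '\n'.join(
    f'  <url>\n    <loc>{url}</loc>\n    <lastmod>{lastmod}</lastmod>\n  </url>'
    for url in url_list
)
with open('sitemap.xml', 'w') as f:
    f.write(f'{urlset_open}\n{url_entries}\n</urlset>\n')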
@eliasdabbas
eliasdabbas / running_crawls.py
Last active November 2, 2023 13:33
Get a summary of the currently running crawl jobs (using the advertools crawler)
from subprocess import run
from functools import partial

run = partial(run, text=True, capture_output=True)


def running_crawls():
    """Get details of currently running spiders.

    Get a DataFrame showing the following details:
    ...
    """
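    # Illustrative body; the gist preview is truncated after the docstring.
    # advertools crawls run as Scrapy processes, so one way to summarize them
    # is to parse `ps` output and keep rows whose command line mentions scrapy
    # (assumes a Unix-like system and pandas installed):
    import pandas as pd
    proc = run(['ps', 'xo', 'pid,etime,%cpu,%mem,args'])
    header, *rows = proc.stdout.splitlines()
    columns = header.split(maxsplit=4)
    df = pd.DataFrame([row.split(maxsplit=4) for row in rows], columns=columns)
    return df[df[columns[-1]].str.contains('scrapy', case=False)]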
@eliasdabbas
eliasdabbas / news_headlines_automated.py
Created October 21, 2023 14:14
Get the main headline story from the homepages of news websites.
import advertools as adv

url_xpath_selectors = {
    'https://www.ft.com': ('main_story_headline', '//span[contains(@class, "text text--color-black text-display--scale-7 text--weight-500")]/text()'),
    'https://www.nytimes.com': ('main_story_headline', '//h3[@class="indicate-hover css-si8ren"]/text()'),
    'https://www.economist.com': ('main_story_headline', '//a[@data-analytics="top_stories:headline_1"]/text()'),
    'https://edition.cnn.com': ('main_story_headline', '//h2[@class="container__title_url-text container_lead-package__title_url-text"]/text()'),
    'https://www.nbcnews.com': ('main_story_headline', '//h2[@class="storyline__headline founders-cond fw6 important large headlineOnly"]/text()'),
    'https://www.bbc.com': ('main_story_headline', '//a[@rev="hero1|headline"]/text()'),
    'https://www.foxnews.com': ('main_story_headline', '(//header[@class="info-header"])[1]//a/text()'),
    # ... any remaining sites are truncated in this preview
}
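The preview ends inside the selector dict, before any crawling happens. One way to use it, sketched here: adv.crawl accepts an xpath_selectors dict mapping column names to XPath expressions, so each homepage can be crawled with its own selector into a single jsonlines file (the output path is illustrative):

for url, (column, xpath) in url_xpath_selectors.items():
    adv.crawl(url, 'headlines.jl', xpath_selectors={column: xpath})

import pandas as pd
headlines = pd.read_json('headlines.jl', lines=True)
print(headlines[['url', 'main_story_headline']].dropna())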
@eliasdabbas
eliasdabbas / incremental_crawling.py
Last active October 20, 2023 12:42
Incremental crawling with advertools. Crawl a set number of pages every time without re-crawling the same pages
import advertools as adv

adv.crawl(
    # Start crawling from this URL (or list of URLs):
    url_list='https://en.wikipedia.org/wiki/Main_Page',
    # Save the crawl output to this file:
    output_file='/home/user_name/wikipedia_en_crawl.jl',
    # Should it follow links?
    follow_links=True,
    # But don't follow all links, only links that match this regex:
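    # (The actual regex is truncated in the preview; '/wiki/' is illustrative.)
    include_url_regex='/wiki/',
    # The incremental part the description promises, sketched here with Scrapy
    # settings passed through advertools (the values and paths are illustrative):
    custom_settings={
        # Stop each run after roughly this many pages:
        'CLOSESPIDER_PAGECOUNT': 1000,
        # Persist the crawl queue and seen requests to disk, so the next run
        # resumes where this one stopped instead of re-crawling the same pages:
        'JOBDIR': '/home/user_name/wikipedia_crawl_job',
    },
)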
@eliasdabbas
eliasdabbas / meta_tags.py
Created August 12, 2023 11:31
Get all meta tags of a selected URL (every tag under the <head> section of the page)
import requests
from bs4 import BeautifulSoup
import pandas as pd


def meta_tags(url, get_text=['title']):
    """Get all tags under the <head> of `url` with all attributes and values.

    This is mainly for exploratory purposes, to discover what is available,
    and whether there are errors. If you know which tags/attributes you want
    beforehand, you can easily get them with custom extraction (CSS/XPath
    selectors).
    """
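    # Illustrative body; the gist preview is truncated after the docstring.
    # Fetch the page, then collect one row per tag under <head> with its name,
    # attributes, and (for tags listed in `get_text`) its text content:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    rows = []
    for tag in soup.find('head').find_all():
        row = {'tag': tag.name, **tag.attrs}
        if tag.name in get_text:
            row['text'] = tag.get_text(strip=True)
        rows.append(row)
    return pd.DataFrame(rows)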
@eliasdabbas
eliasdabbas / jl_to_parquet.py
Created July 16, 2023 11:01
Convert a jsonlines file to a compressed parquet file (if JSON objects have different types, e.g. list and scalar in the same column, it converts them to strings)
def jl_to_parquet(jl_filepath, parquet_filepath):
    """Convert a jsonlines crawl file to the parquet format.

    Parameters
    ----------
    jl_filepath : str
        The path of an existing .jl file.
    parquet_filepath : str
        The path where you want the new file to be saved (ending with .parquet).
    """
@eliasdabbas
eliasdabbas / tableviz.py
Created July 15, 2023 10:01
Visualizing tables with Plotly
import adviz
import plotly.express as px
from plotly.subplots import make_subplots


def category_to_color(categories, colorscale='D3'):
    # Look up the named qualitative colorscale on plotly.express:
    colorscale = getattr(px.colors.qualitative, colorscale)
    # Map each distinct category to one color, then color every item:
    cat_dict = dict(enumerate(set(categories)))
    cat_dict = {v: colorscale[k] for k, v in cat_dict.items()}
    return [cat_dict[cat] for cat in categories]
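A quick usage sketch with hypothetical categories (exact hex values depend on the chosen colorscale and on set iteration order):

colors = category_to_color(['guard', 'center', 'guard', 'forward'])
# One color per item, with repeated categories sharing the same color, e.g.:
# ['#1F77B4', '#FF7F0E', '#1F77B4', '#2CA02C']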
@eliasdabbas
eliasdabbas / crawl_link_summary.py
Created July 14, 2023 08:38
Organize links in an advertools crawl DataFrame
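No code from this gist survives in the preview. A minimal sketch of the idea, assuming standard advertools crawl output, where links_url, links_text, and links_nofollow hold '@@'-separated parallel values per page (the function name and exact output shape are illustrative):

import pandas as pd

def crawl_link_summary(crawldf):
    """Return one row per (page, link): link URL, anchor text, nofollow flag."""
    link_cols = ['links_url', 'links_text', 'links_nofollow']
    df = crawldf[['url'] + link_cols].dropna(subset=['links_url']).copy()
    for col in link_cols:
        df[col] = df[col].str.split('@@')
    # The three columns are parallel lists; explode them together:
    return df.explode(link_cols, ignore_index=True)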
@eliasdabbas
eliasdabbas / redirect_chains.py
Created July 9, 2023 21:52
Get redirect chains from an advertools crawl dataset
def redirect_chains(crawldf):
    """Create a tidy DataFrame for redirects with the columns:

    url: All the URLs in the redirect chain.
    status: The status code of each URL.
    type: "requested", "intermediate", or "crawled".
    order: 1, 2, 3... up to the number of URLs in the redirect chain.
    redirect_times: The number of redirects in the chain (URLs in the chain minus one).
    """
    redirect_df = (crawldf[['url', 'status', 'redirect_urls', 'redirect_reasons']]
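                   # Illustrative continuation; the gist preview is truncated
                   # after the line above. Assumes advertools crawl output,
                   # where redirect_urls and redirect_reasons hold
                   # '@@'-separated chains, and pandas imported as pd.
                   .dropna(subset=['redirect_urls']))
    chains = []
    for url, status, redirect_urls, reasons in redirect_df.itertuples(index=False):
        # Requested URL, any intermediate hops, then the finally crawled URL:
        urls = str(redirect_urls).split('@@') + [url]
        # One redirect status code per hop, then the final status code:
        statuses = str(reasons).split('@@') + [status]
        chains.append(pd.DataFrame({
            'url': urls,
            'status': statuses,
            'type': ['requested'] + ['intermediate'] * (len(urls) - 2) + ['crawled'],
            'order': range(1, len(urls) + 1),
            'redirect_times': len(urls) - 1,
        }))
    return pd.concat(chains, ignore_index=True)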