Elias Dabbas eliasdabbas

💭
https://nbastats.pro
View GitHub Profile
@eliasdabbas
eliasdabbas / filter_non_200_status_codes.py
Created June 2, 2024 19:17
Filter non-200 status codes on a daily basis
import os
import datetime
import pandas as pd
today = datetime.datetime.now(datetime.UTC).strftime('%Y_%m_%d')
url_status_time = pd.concat(
    pd.read_json(f'/path/to/status_codes/{file}',
                 lines=True)
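The preview cuts off inside pd.concat. A minimal self-contained sketch of the same idea (not the gist's actual continuation), assuming the daily header crawls sit as .jl files under the hypothetical /path/to/status_codes/ directory and contain a 'status' column:

import os
import datetime

import pandas as pd

status_dir = '/path/to/status_codes/'  # hypothetical directory, one .jl file per day
today = datetime.datetime.now(datetime.UTC).strftime('%Y_%m_%d')

# combine all daily crawl files into one DataFrame
url_status_time = pd.concat(
    pd.read_json(os.path.join(status_dir, file), lines=True)
    for file in os.listdir(status_dir)
    if file.endswith('.jl')
)

# keep only the URLs that did not return a 200 status code
non_200 = url_status_time[url_status_time['status'].ne(200)]
non_200.to_csv(f'/path/to/reports/non_200_{today}.csv', index=False)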
@eliasdabbas
eliasdabbas / daily_status_code.py
Created June 1, 2024 20:00
Setting up a daily status code checker
import datetime
import advertools as adv
today = datetime.datetime.now(datetime.UTC).strftime("%Y_%m_%d")
sitemap = adv.sitemap_to_df("https://example.com/sitemap.xml")
adv.crawl_headers(
    sitemap["loc"],
@eliasdabbas
eliasdabbas / create_list_crawler_for_sheets.py
Created May 22, 2024 11:19
Create an SEO crawler in list mode working on Google Sheets
import pandas as pd
xpath = pd.read_html('https://advertools.readthedocs.io/en/master/advertools.code_recipes.spider_strategies.html')
df = xpath[2].iloc[:, [1, 2]]
df = df.assign(expression=[f'=textjoin("@@",100,IMPORTXML(A2,"{expression}"))' for expression in df['XPath Expression']])
display(df[['Suggested Name', 'expression']].T)
df[['Suggested Name', 'expression']].T.to_clipboard(index=False)
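A rough usage sketch for the clipboard output (the XPath shown is only an example, not taken from the gist): paste the transposed table into cell B1 of a Google Sheet, so that row 1 holds the suggested column names and row 2 holds formulas of the form

=textjoin("@@",100,IMPORTXML(A2,"//title/text()"))

then put the URLs to crawl in column A starting at A2 and fill the formulas down, so each URL gets one IMPORTXML extraction per column, with multiple matches joined by the "@@" separator.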
@eliasdabbas
eliasdabbas / url_to_html_sitemap.py
Last active April 16, 2024 10:03
Create an HTML sitemap for a list of URLs and their anchor text
# Create anchors for all letters:
import string
print('<h3>' + '&nbsp;&nbsp;&nbsp;&nbsp;'.join([f'<a href="#{letter}">{letter}</a>' for letter in string.ascii_uppercase]) + '</h3>')
# Assuming you have a DataFrame with the columns "full_name" and "loc":
for letter in string.ascii_uppercase:
    df = players_df[players_df['full_name'].str[0].eq(letter)]
    print()
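The loop body is cut off in the preview. A minimal sketch of how the per-letter sections might be rendered (the DataFrame contents and the section markup are assumptions, not the gist's actual continuation):

import string

import pandas as pd

# hypothetical input: URLs and their anchor texts
players_df = pd.DataFrame({
    'full_name': ['Doug Collins', 'Hakeem Olajuwon'],
    'loc': ['https://nbastats.pro/player/Doug_Collins',
            'https://nbastats.pro/player/Hakeem_Olajuwon'],
})

# letter index at the top of the page
print('<h3>' + '&nbsp;&nbsp;&nbsp;&nbsp;'.join(
    f'<a href="#{letter}">{letter}</a>' for letter in string.ascii_uppercase) + '</h3>')

# one section per letter, with an id that the index links point to
for letter in string.ascii_uppercase:
    df = players_df[players_df['full_name'].str[0].eq(letter)]
    if df.empty:
        continue
    print(f'<h2 id="{letter}">{letter}</h2>')
    for url, text in zip(df['loc'], df['full_name']):
        print(f'<a href="{url}">{text}</a><br>')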
@eliasdabbas
eliasdabbas / url_to_xml_sitemap.py
Created April 15, 2024 12:15
Convert a list of URLs to an XML sitemap
import datetime
import pandas as pd
lastmod = datetime.datetime.now(datetime.UTC).strftime('%Y-%m-%d')
url_list = [
'https://nbastats.pro/player/Eric_Moreland',
'https://nbastats.pro/player/Marc_Iavaroni',
'https://nbastats.pro/player/Keith_Tower',
'https://nbastats.pro/player/Hakeem_Olajuwon',
'https://nbastats.pro/player/Mike_Price',
'https://nbastats.pro/player/Doug_Collins',
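The URL list is truncated in the preview. A minimal sketch of turning such a list into a sitemap file (the entry template follows the standard sitemap protocol; the output filename is an assumption):

import datetime

lastmod = datetime.datetime.now(datetime.UTC).strftime('%Y-%m-%d')

url_list = [
    'https://nbastats.pro/player/Hakeem_Olajuwon',
    'https://nbastats.pro/player/Doug_Collins',
]

# one <url> entry per URL, all sharing today's lastmod date
url_entries = '\n'.join(
    f'  <url>\n    <loc>{url}</loc>\n    <lastmod>{lastmod}</lastmod>\n  </url>'
    for url in url_list
)

sitemap_xml = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
    f'{url_entries}\n'
    '</urlset>'
)

with open('sitemap.xml', 'w') as f:
    f.write(sitemap_xml)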
@eliasdabbas
eliasdabbas / running_crawls.py
Last active November 2, 2023 13:33
Get a summary of the currently running crawl jobs (using the advertools crawler)
from subprocess import run
from functools import partial
run = partial(run, text=True, capture_output=True)
def running_crawls():
    """Get details of currently running spiders.

    Get a DataFrame showing the following details:
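The function body is cut off after the docstring. A sketch of one way such a summary could be built (assuming a Linux-style ps and that advertools crawls show up as scrapy processes; this is not necessarily the gist's implementation):

from functools import partial
from subprocess import run

import pandas as pd

run = partial(run, text=True, capture_output=True)

def running_crawls():
    """Return a DataFrame with one row per currently running scrapy process."""
    # list all processes with PID, elapsed time, %CPU, %MEM and the full command
    ps = run(['ps', '-eo', 'pid,etime,%cpu,%mem,args'])
    lines = ps.stdout.splitlines()
    header, rows = lines[0].split(), lines[1:]
    records = [row.split(None, 4) for row in rows if 'scrapy' in row]
    return pd.DataFrame(records, columns=header)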
@eliasdabbas
eliasdabbas / news_headlines_automated.py
Created October 21, 2023 14:14
Get the main headline story from the homepages of news websites.
import advertools as adv
url_xpath_selectors = {
'https://www.ft.com': ('main_story_headline', '//span[contains(@class, "text text--color-black text-display--scale-7 text--weight-500")]/text()'),
'https://www.nytimes.com': ('main_story_headline', '//h3[@class="indicate-hover css-si8ren"]/text()'),
'https://www.economist.com': ('main_story_headline', '//a[@data-analytics="top_stories:headline_1"]/text()'),
'https://edition.cnn.com': ('main_story_headline', '//h2[@class="container__title_url-text container_lead-package__title_url-text"]/text()'),
'https://www.nbcnews.com': ('main_story_headline', '//h2[@class="storyline__headline founders-cond fw6 important large headlineOnly"]/text()'),
'https://www.bbc.com': ('main_story_headline', '//a[@rev="hero1|headline"]/text()'),
'https://www.foxnews.com': ('main_story_headline', '(//header[@class="info-header"])[1]//a/text()'),
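The selector dictionary is truncated above. A sketch of how these per-site XPaths might be used with the advertools crawler (the output path is a placeholder, and this loop is an illustration rather than the gist's exact code):

import advertools as adv
import pandas as pd

# one entry per news homepage, as in the dictionary above
url_xpath_selectors = {
    'https://www.bbc.com': ('main_story_headline',
                            '//a[@rev="hero1|headline"]/text()'),
}

# crawl each homepage with its own selector; all crawls append to the same file
for url, (colname, xpath) in url_xpath_selectors.items():
    adv.crawl(
        url_list=url,
        output_file='/path/to/news_headlines.jl',
        xpath_selectors={colname: xpath},
    )

headlines = pd.read_json('/path/to/news_headlines.jl', lines=True)
print(headlines[['url', 'main_story_headline']])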
@eliasdabbas
eliasdabbas / incremental_crawling.py
Last active October 20, 2023 12:42
Incremental crawling with advertools. Crawl a set number of pages every time without re-crawling the same pages
import advertools as adv
adv.crawl(
    # start crawling from this URL (or list of URLs):
    url_list='https://en.wikipedia.org/wiki/Main_Page',
    # save the crawl output to this file:
    output_file='/home/user_name/wikipedia_en_crawl.jl',
    # should it follow links?
    follow_links=True,
    # but don't follow all links, only links that match this regex:
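The call is cut off before the regex and the crawl settings. A sketch of how the incremental part is typically handled with Scrapy settings (the regex, page count, and job directory are illustrative values, not necessarily those of the gist):

import advertools as adv

adv.crawl(
    url_list='https://en.wikipedia.org/wiki/Main_Page',
    output_file='/home/user_name/wikipedia_en_crawl.jl',
    follow_links=True,
    # only follow links matching this regex (English Wikipedia articles)
    include_url_regex='https://en.wikipedia.org/wiki/',
    custom_settings={
        # stop after roughly this many pages per run
        'CLOSESPIDER_PAGECOUNT': 1000,
        # persist the request queue and the set of seen URLs between runs,
        # so the next run continues without re-crawling the same pages
        'JOBDIR': '/home/user_name/wikipedia_en_crawl_jobdir',
    },
)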
@eliasdabbas
eliasdabbas / meta_tags.py
Created August 12, 2023 11:31
Get all meta tags of a selected URL (all tags under the <head> section of the page)
import requests
from bs4 import BeautifulSoup
import pandas as pd
def meta_tags(url, get_text=['title']):
    """Get all tags under the <head> of `url` with all attributes and values.

    This is mainly for exploratory purposes, to discover what is available,
    and if there are errors. If you know which tags/attributes you want beforehand,
    you can easily get them with custom extraction (CSS/XPath selectors).
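The docstring is cut off and the function body isn't shown. A minimal sketch of one possible implementation (not necessarily the gist's; the User-Agent string is made up):

import pandas as pd
import requests
from bs4 import BeautifulSoup

def meta_tags(url, get_text=('title',)):
    """Return a DataFrame with one row per tag under the <head> of `url`."""
    resp = requests.get(url, headers={'User-Agent': 'meta-tags-checker'})
    soup = BeautifulSoup(resp.text, 'html.parser')
    rows = []
    for tag in soup.head.find_all(True):
        # tag name plus all of its attributes, one column per attribute
        row = {'tag': tag.name, **tag.attrs}
        if tag.name in get_text:
            row['text'] = tag.get_text(strip=True)
        rows.append(row)
    return pd.DataFrame(rows)

meta_tags('https://example.com')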
@eliasdabbas
eliasdabbas / jl_to_parquet.py
Created July 16, 2023 11:01
Convert a jsonlines file to a compressed parquet file (if JSON objects have different types, e.g. list and scalar in the same column, it converts them to strings)
def jl_to_parquet(jl_filepath, parquet_filepath):
    """Convert a jsonlines crawl file to the parquet format.

    Parameters
    ----------
    jl_filepath : str
        The path of an existing .jl file.
    parquet_filepath : str
        The path where you want the new file to be saved (ending with .parquet).
    """