Elias Dabbas (eliasdabbas)
@eliasdabbas
eliasdabbas / url_to_html_sitemap.py
Last active April 16, 2024 10:03
Create an HTML sitemap for a list of URLs and their anchor text
# Create anchors for all letters:
import string

print('<h3>' + '&nbsp;&nbsp;&nbsp;&nbsp;'.join([f'<a href="#{letter}">{letter}</a>' for letter in string.ascii_uppercase]) + '</h3>')

# Assuming you have a DataFrame `players_df` with the columns "full_name" and "loc":
for letter in string.ascii_uppercase:
    df = players_df[players_df['full_name'].str[0].eq(letter)]
    print()
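    # Illustrative continuation; the gist preview is truncated after the blank
    # print() above. Print a letter heading (the anchor target), then one link
    # per row whose name starts with `letter`:
    print(f'<h2 id="{letter}">{letter}</h2>')
    for name, loc in zip(df['full_name'], df['loc']):
        print(f'<p><a href="{loc}">{name}</a></p>')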
@eliasdabbas
eliasdabbas / url_to_xml_sitemap.py
Created April 15, 2024 12:15
Convert a list of URLs to an XML sitemap
import datetime

import pandas as pd

lastmod = datetime.datetime.now(datetime.UTC).strftime('%Y-%m-%d')

url_list = [
    'https://nbastats.pro/player/Eric_Moreland',
    'https://nbastats.pro/player/Marc_Iavaroni',
    'https://nbastats.pro/player/Keith_Tower',
    'https://nbastats.pro/player/Hakeem_Olajuwon',
    'https://nbastats.pro/player/Mike_Price',
    'https://nbastats.pro/player/Doug_Collins',
    # ... the rest of the URL list is truncated in this preview
]
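The preview cuts off inside the URL list, before the sitemap itself is built. A minimal sketch of the remaining step, reusing url_list and lastmod from above (the string-templating approach and the output filename are illustrative, not necessarily the gist's):

urlset_open = ('<?xml version="1.0" encoding="UTF-8"?>\n'
               '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
url_entries = '\n'.join(
    f'  <url>\n    <loc>{url}</loc>\n    <lastmod>{lastmod}</lastmod>\n  </url>'
    for url in url_list
)
with open('sitemap.xml', 'w') as f:
    f.write(f'{urlset_open}\n{url_entries}\n</urlset>\n')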
@eliasdabbas
eliasdabbas / running_crawls.py
Last active November 2, 2023 13:33
Get a summary of the currently running crawl jobs (using the advertools crawler)
from subprocess import run
from functools import partial

run = partial(run, text=True, capture_output=True)


def running_crawls():
    """Get details of currently running spiders.

    Get a DataFrame showing the following details:
    ...
    """
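    # Illustrative body; the gist preview is truncated after the docstring.
    # advertools crawls run as Scrapy processes, so one way to summarize them
    # is to parse `ps` output and keep rows whose command line mentions scrapy
    # (assumes a Unix-like system and pandas installed):
    import pandas as pd
    proc = run(['ps', 'xo', 'pid,etime,%cpu,%mem,args'])
    header, *rows = proc.stdout.splitlines()
    columns = header.split(maxsplit=4)
    df = pd.DataFrame([row.split(maxsplit=4) for row in rows], columns=columns)
    return df[df[columns[-1]].str.contains('scrapy', case=False)]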
@eliasdabbas
eliasdabbas / news_headlines_automated.py
Created October 21, 2023 14:14
Get the main headline story from the homepages of news websites.
import advertools as adv

url_xpath_selectors = {
    'https://www.ft.com': ('main_story_headline', '//span[contains(@class, "text text--color-black text-display--scale-7 text--weight-500")]/text()'),
    'https://www.nytimes.com': ('main_story_headline', '//h3[@class="indicate-hover css-si8ren"]/text()'),
    'https://www.economist.com': ('main_story_headline', '//a[@data-analytics="top_stories:headline_1"]/text()'),
    'https://edition.cnn.com': ('main_story_headline', '//h2[@class="container__title_url-text container_lead-package__title_url-text"]/text()'),
    'https://www.nbcnews.com': ('main_story_headline', '//h2[@class="storyline__headline founders-cond fw6 important large headlineOnly"]/text()'),
    'https://www.bbc.com': ('main_story_headline', '//a[@rev="hero1|headline"]/text()'),
    'https://www.foxnews.com': ('main_story_headline', '(//header[@class="info-header"])[1]//a/text()'),
    # ... any remaining sites are truncated in this preview
}
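The preview ends inside the selector dict, before any crawling happens. One way to use it, sketched here: adv.crawl accepts an xpath_selectors dict mapping column names to XPath expressions, so each homepage can be crawled with its own selector into a single jsonlines file (the output path is illustrative):

for url, (column, xpath) in url_xpath_selectors.items():
    adv.crawl(url, 'headlines.jl', xpath_selectors={column: xpath})

import pandas as pd
headlines = pd.read_json('headlines.jl', lines=True)
print(headlines[['url', 'main_story_headline']].dropna())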
@eliasdabbas
eliasdabbas / incremental_crawling.py
Last active October 20, 2023 12:42
Incremental crawling with advertools. Crawl a set number of pages every time without re-crawling the same pages
import advertools as adv

adv.crawl(
    # Start crawling from this URL (or list of URLs):
    url_list='https://en.wikipedia.org/wiki/Main_Page',
    # Save the crawl output to this file:
    output_file='/home/user_name/wikipedia_en_crawl.jl',
    # Should it follow links?
    follow_links=True,
    # But don't follow all links, only links that match this regex:
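    # (The actual regex is truncated in the preview; '/wiki/' is illustrative.)
    include_url_regex='/wiki/',
    # The incremental part the description promises, sketched here with Scrapy
    # settings passed through advertools (the values and paths are illustrative):
    custom_settings={
        # Stop each run after roughly this many pages:
        'CLOSESPIDER_PAGECOUNT': 1000,
        # Persist the crawl queue and seen requests to disk, so the next run
        # resumes where this one stopped instead of re-crawling the same pages:
        'JOBDIR': '/home/user_name/wikipedia_crawl_job',
    },
)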
@eliasdabbas
eliasdabbas / meta_tags.py
Created August 12, 2023 11:31
Get all meta tags of a selected URL (every tag under the <head> section of the page)
import requests
from bs4 import BeautifulSoup
import pandas as pd


def meta_tags(url, get_text=['title']):
    """Get all tags under the <head> of `url` with all attributes and values.

    This is mainly for exploratory purposes, to discover what is available,
    and whether there are errors. If you know which tags/attributes you want
    beforehand, you can easily get them with custom extraction (CSS/XPath
    selectors).
    """
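    # Illustrative body; the gist preview is truncated after the docstring.
    # Fetch the page, then collect one row per tag under <head> with its name,
    # attributes, and (for tags listed in `get_text`) its text content:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    rows = []
    for tag in soup.find('head').find_all():
        row = {'tag': tag.name, **tag.attrs}
        if tag.name in get_text:
            row['text'] = tag.get_text(strip=True)
        rows.append(row)
    return pd.DataFrame(rows)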
@eliasdabbas
eliasdabbas / jl_to_parquet.py
Created July 16, 2023 11:01
Convert a jsonlines file to a compressed parquet file (if JSON objects have different types, e.g. list and scalar in the same column, it converts them to strings)
def jl_to_parquet(jl_filepath, parquet_filepath):
    """Convert a jsonlines crawl file to the parquet format.

    Parameters
    ----------
    jl_filepath : str
        The path of an existing .jl file.
    parquet_filepath : str
        The path where you want the new file to be saved (ending with .parquet).
    """
@eliasdabbas
eliasdabbas / tableviz.py
Created July 15, 2023 10:01
Visualizing tables with Plotly
import adviz
import plotly.express as px
from plotly.subplots import make_subplots


def category_to_color(categories, colorscale='D3'):
    # Look up the named qualitative colorscale on plotly.express:
    colorscale = getattr(px.colors.qualitative, colorscale)
    # Map each distinct category to one color, then color every item:
    cat_dict = dict(enumerate(set(categories)))
    cat_dict = {v: colorscale[k] for k, v in cat_dict.items()}
    return [cat_dict[cat] for cat in categories]
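A quick usage sketch with hypothetical categories (exact hex values depend on the chosen colorscale and on set iteration order):

colors = category_to_color(['guard', 'center', 'guard', 'forward'])
# One color per item, with repeated categories sharing the same color, e.g.:
# ['#1F77B4', '#FF7F0E', '#1F77B4', '#2CA02C']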
@eliasdabbas
eliasdabbas / crawl_link_summary.py
Created July 14, 2023 08:38
Organize links in an advertools crawl DataFrame
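No code from this gist survives in the preview. A minimal sketch of the idea, assuming standard advertools crawl output, where links_url, links_text, and links_nofollow hold '@@'-separated parallel values per page (the function name and exact output shape are illustrative):

import pandas as pd

def crawl_link_summary(crawldf):
    """Return one row per (page, link): link URL, anchor text, nofollow flag."""
    link_cols = ['links_url', 'links_text', 'links_nofollow']
    df = crawldf[['url'] + link_cols].dropna(subset=['links_url']).copy()
    for col in link_cols:
        df[col] = df[col].str.split('@@')
    # The three columns are parallel lists; explode them together:
    return df.explode(link_cols, ignore_index=True)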
@eliasdabbas
eliasdabbas / redirect_chains.py
Created July 9, 2023 21:52
Get redirect chains from an advertools crawl dataset
def redirect_chains(crawldf):
    """Create a tidy DataFrame for redirects with the columns:

    url: All the URLs in the redirect chain.
    status: The status code of each URL.
    type: "requested", "intermediate", or "crawled".
    order: 1, 2, 3... up to the number of URLs in the redirect chain.
    redirect_times: The number of redirects in the chain (URLs in the chain minus one).
    """
    redirect_df = (crawldf[['url', 'status', 'redirect_urls', 'redirect_reasons']]
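                   # Illustrative continuation; the gist preview is truncated
                   # after the line above. Assumes advertools crawl output,
                   # where redirect_urls and redirect_reasons hold
                   # '@@'-separated chains, and pandas imported as pd.
                   .dropna(subset=['redirect_urls']))
    chains = []
    for url, status, redirect_urls, reasons in redirect_df.itertuples(index=False):
        # Requested URL, any intermediate hops, then the finally crawled URL:
        urls = str(redirect_urls).split('@@') + [url]
        # One redirect status code per hop, then the final status code:
        statuses = str(reasons).split('@@') + [status]
        chains.append(pd.DataFrame({
            'url': urls,
            'status': statuses,
            'type': ['requested'] + ['intermediate'] * (len(urls) - 2) + ['crawled'],
            'order': range(1, len(urls) + 1),
            'redirect_times': len(urls) - 1,
        }))
    return pd.concat(chains, ignore_index=True)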