This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import datetime | |
import pandas as pd | |
today = datetime.datetime.now(datetime.UTC).strftime('%Y_%m_%d') | |
url_status_time = pd.concat( | |
pd.read_json(f'/path/to/status_codes/{file}', | |
lines=True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime | |
import advertools as adv | |
today = datetime.datetime.now(datetime.UTC).strftime("%Y_%m_%d") | |
sitemap = adv.sitemap_to_df("https://example/sitemap.xml") | |
adv.crawl_headers( | |
sitemap["loc"], |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
# Read every HTML table found on the advertools "spider strategies" page.
xpath = pd.read_html(
    'https://advertools.readthedocs.io/en/master/advertools.code_recipes.spider_strategies.html'
)
# Keep the columns at positions 1 and 2 of the third table on the page.
df = xpath[2].iloc[:, [1, 2]]
# Wrap each XPath expression in a spreadsheet formula that runs IMPORTXML
# against the URL in cell A2 and joins the matches with "@@".
formulas = [
    f'=textjoin("@@",100,IMPORTXML(A2,"{xp}"))'
    for xp in df['XPath Expression']
]
df = df.assign(expression=formulas)
# Transpose the name/formula pairs, show them, and copy them to the clipboard.
transposed = df[['Suggested Name', 'expression']].T
display(transposed)
transposed.to_clipboard(index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create anchors for all letters: | |
import string | |
# One in-page link per uppercase ASCII letter, joined by spaces inside an <h3>.
anchor_links = [
    f'<a href="#{letter}">{letter}</a>' for letter in string.ascii_uppercase
]
print('<h3>' + ' '.join(anchor_links) + '</h3>')
# Assuming you have a DataFrame with the columns "full_name" and "loc": | |
for letter in string.ascii_uppercase: | |
df = players_df[players_static['full_name'].str[0].eq(letter)] | |
print() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime | |
import pandas as pd | |
lastmod = datetime.datetime.now(datetime.UTC).strftime('%Y-%m-%d') | |
url_list = [ | |
'https://nbastats.pro/player/Eric_Moreland', | |
'https://nbastats.pro/player/Marc_Iavaroni', | |
'https://nbastats.pro/player/Keith_Tower', | |
'https://nbastats.pro/player/Hakeem_Olajuwon', | |
'https://nbastats.pro/player/Mike_Price', | |
'https://nbastats.pro/player/Doug_Collins', |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from subprocess import run | |
from functools import partial | |
run = partial(run, text=True, capture_output=True) | |
def running_crawls(): | |
"""Get details of currently running spiders. | |
Get a DataFrame showing the following details: | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import advertools as adv | |
url_xpath_selectors = { | |
'https://www.ft.com': ('main_story_headline', '//span[contains(@class, "text text--color-black text-display--scale-7 text--weight-500")]/text()'), | |
'https://www.nytimes.com': ('main_story_headline', '//h3[@class="indicate-hover css-si8ren"]/text()'), | |
'https://www.economist.com': ('main_story_headline', '//a[@data-analytics="top_stories:headline_1"]/text()'), | |
'https://edition.cnn.com': ('main_story_headline', '//h2[@class="container__title_url-text container_lead-package__title_url-text"]/text()'), | |
'https://www.nbcnews.com': ('main_story_headline', '//h2[@class="storyline__headline founders-cond fw6 important large headlineOnly"]/text()'), | |
'https://www.bbc.com': ('main_story_headline', '//a[@rev="hero1|headline"]/text()'), | |
'https://www.foxnews.com': ('main_story_headline', '(//header[@class="info-header"])[1]//a/text()'), |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import advertools as adv | |
adv.crawl( | |
# start crawling from this URL(s): | |
url_list='https://en.wikipedia.org/wiki/Main_Page', | |
# save the crawl output to this file: | |
output_file='/home/user_name/wikipedia_en_crawl.jl', | |
# Should it follow links? | |
follow_links=True, | |
# But don't follow all links, only links that match this regex: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def jl_to_parquet(jl_filepath, parquet_filepath): | |
"""Convert a jsonlines crawl file to the parquet format. | |
Parameters | |
---------- | |
jl_filepath : str | |
The path of an existing .jl file. | |
parquet_filepath : str | |
The path where you want the new file to be saved (ending with .parquet). | |
""" |
NewerOlder