Skip to content

Instantly share code, notes, and snippets.

hamletbatista

Block or report user

Report or block hamletbatista

Hide content and notifications from this user.

Learn more about blocking users

Contact Support about this user’s behavior.

Learn more about reporting abuse

Report abuse
View GitHub Profile
View bert_model_definition.py
#https://uber.github.io/ludwig/user_guide/#bert-encoder
template="""
input_features:
-
name: Questions
type: text
encoder: bert
config_path: uncased_L-12_H-768_A-12/bert_config.json
checkpoint_path: uncased_L-12_H-768_A-12/bert_model.ckpt
View crawl_delay_pe.py
# Both timestamp columns must be parsed into datetimes before subtracting them.
for column in ("lastmod", "date"):
    crawled[column] = pd.to_datetime(crawled[column])
# Crawl delay: time elapsed from "lastmod" to "date" for each row.
crawled["crawl_delay"] = crawled["date"].sub(crawled["lastmod"])
View crawled_notcrawled_pe.py
# Left-join the log rows onto df by path: every df row is kept, and rows
# without a matching log entry get missing values in the log columns.
merged = pd.merge(left=df, right=df_logs, how="left", left_on="path", right_on="path")

# Pages not crawled: rows whose "date" is missing after the join
# (presumably "date" comes from the log data -- verify against df's columns).
never_seen = merged["date"].isnull()
notcrawled = merged.loc[never_seen, ["path", "lastmod", "date"]]
notcrawled.to_csv("notcrawled.csv")

# Pages crawled: keep only rows where lastmod, date and path are all present.
crawled = merged[["lastmod", "date", "path"]].dropna(how="any")
crawled.to_csv("crawled.csv")
View load_pe_logs_to_df.py
# Load the CSV version of the access log into a DataFrame.
logs_csv = "practicalecommerce.com-ssl_log-Jul-2019.csv"
df_logs = pd.read_csv(logs_csv)
# Preview the first rows (shown as the cell output when run in a notebook).
df_logs.head()
View convert_xml_sitemaps_csv.py
# Download the XML sitemap index and keep its raw text in `xml`.
from bs4 import BeautifulSoup
import requests

sitemap_index_url="https://www.practicalecommerce.com/sitemapindex.xml"
# Starts empty; presumably filled by code further down (not visible here).
sitemap_index = {}
# requests has no default timeout: without one, an unresponsive server would
# block this call indefinitely.
r = requests.get(sitemap_index_url, timeout=30)
# Fail loudly on 4xx/5xx instead of treating an error page as sitemap XML.
r.raise_for_status()
xml = r.text
View head_log_pe.sh
!head practicalecommerce.com-ssl_log-Jul-2019.csv
#ip,date,url,path,file_extension,query,status_code,ua
#66.249.69.196,2019-06-30T08:05:31+00:00,/Crawl-Your-Ecommerce-Site-with-Python-Scrapy-,/Crawl-Your-Ecommerce-Site-with-Python-Scrapy-,,,301,Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
#66.249.69.196,2019-06-30T08:05:32+00:00,/category/design-development,/category/design-development,,,200,Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
#66.249.69.196,2019-06-30T08:05:32+00:00,/Crawl-Your-Ecommerce-Site-with-Python-Scrapy,/Crawl-Your-Ecommerce-Site-with-Python-Scrapy,,,200,Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
#66.249.69.196,2019-06-30T08:05:38+00:00,/wp-content/uploads/2015/05/practical-ecommerce-icon.png-144,/wp-content/uploads/2015/05/practical-ecommerce-icon.png-144,png-144,,404,Googlebot-Image/1.0
#66.249.69.200,2019-06-30T08:06:23+00:00,/real-benefit-amazon-reviews?amp%2525252525252525253BlastReferrer=www.avalara.c
View process_pe_logs.py
from urllib.parse import urlparse
from datetime import timezone
#Convert log to CSV
# File names used for the conversion. Judging by the names and extensions,
# csvfile is the CSV to be written and logfile the raw access log to be read
# -- TODO confirm against the code that uses them (not visible here).
csvfile = "practicalecommerce.com-ssl_log-Jul-2019.csv"
logfile="practicalecommerce.com-ssl_log-Jul-2019.log"
View verify_googlebot.py
import socket
def verify_googlebot(bot_ip):
try:
host = socket.gethostbyaddr(bot_ip)
bot_name = host[0]
except:
#no PTR record
#print(bot_ip)
View pe_logs_regex.py
#https://regex101.com/r/ElmF2y/2/ (original pattern; the size field is widened below)
# Pattern for one line of a combined-format access log. Capture groups, in order:
#   1. client IP          -- first whitespace-delimited token
#   2. timestamp          -- text between "[" and "]"
#   3. requested URL      -- second token of the quoted request line
#   4. HTTP status code   -- digits after the request line
#   5. user agent         -- last quoted field
# The two fields after the IP, the response size and the referrer are matched
# but not captured.
# The size field accepts "-" as well as digits: Apache's %b writes "-" when no
# body bytes were sent (e.g. 304 responses), and requiring digits silently
# dropped those lines.
p = r'^(\S+) \S+ \S+ \[([^\]]+)\] "[A-Z]+\s([^\s]+) [^"]+" (\d+) (?:\d+|-) "[^"]*" "([^"]*)"$'
#example CSV output
#ip,date,url,status_code,ua
#66.249.69.196,30/Jun/2019:08:05:31 -0400,/Crawl-Your-Ecommerce-Site-with-Python-Scrapy-,301,Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
#66.249.69.196,30/Jun/2019:08:05:32 -0400,/category/design-development,200,Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
#66.249.69.196,30/Jun/2019:08:05:32 -0400,/Crawl-Your-Ecommerce-Site-with-Python-Scrapy,200,Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
#66.249.69.196,30/Jun/2019:08:05:38 -0400,/wp-content/uploads/2015/05/practical-ecommerce-icon.png-144,404,Googlebot-Image/1.0
View upload_pe_googlecolab.py
from google.colab import files
# Prompt for a file upload into the Colab runtime; the gzipped access log is
# chosen in the browser.
files.upload()
# Output recorded from one run of the upload:
#practicalecommerce.com-ssl_log-Jul-2019.log.gz(application/x-gzip) - 6663880 bytes, last modified: 7/20/2019 - 100% done
# Saving practicalecommerce.com-ssl_log-Jul-2019.log.gz to practicalecommerce.com-ssl_log-Jul-2019.log.gz
#{'practicalecommerce.com-ssl_log-Jul-2019.log.gz': b'practicalecommerce.com-ssl_log-Jul-2019.log'}
# Decompress the uploaded archive with a shell command (IPython "!" escape,
# not plain Python); gunzip replaces the .gz file with the .log file.
!gunzip practicalecommerce.com-ssl_log-Jul-2019.log.gz
You can’t perform that action at this time.