@DerekHawkins
DerekHawkins / core_web_vitals_analysis_example.py
Created June 1, 2020 13:47
import pandas as pd
import requests
import urllib
import time
import re
# Data Visualization
from plotly import tools
import chart_studio
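
The preview stops at the imports. As a hedged sketch of the likely next step, the snippet below pulls Core Web Vitals field data (LCP, FID, CLS) from Google's PageSpeed Insights v5 API into a DataFrame; fetch_cwv, the empty key, and the example URL are illustrative and not from the gist.

PSI_ENDPOINT = 'https://www.googleapis.com/pagespeedonline/v5/runPagespeed'
psi_key = ''  # a PageSpeed Insights API key goes here

def fetch_cwv(url):
    # CrUX field data sits under loadingExperience.metrics in the PSI response
    resp = requests.get(PSI_ENDPOINT, params={'url': url, 'key': psi_key}).json()
    metrics = resp['loadingExperience']['metrics']
    return {'url': url,
            'lcp_ms': metrics['LARGEST_CONTENTFUL_PAINT_MS']['percentile'],
            'fid_ms': metrics['FIRST_INPUT_DELAY_MS']['percentile'],
            # CLS is reported multiplied by 100 in the percentile field
            'cls': metrics['CUMULATIVE_LAYOUT_SHIFT_SCORE']['percentile'] / 100}

cwv_df = pd.DataFrame([fetch_cwv(u) for u in ['https://example.com/']])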
@DerekHawkins
DerekHawkins / htz_search_comparison.py
Created June 10, 2020 18:19
Example of how to take search interest around multiple keywords and align it to stock activity
import pandas as pd
from time import sleep
from random import randint
from tqdm.notebook import tqdm
### Import Modules and Set Parameters for Pytrends ###
from pytrends.request import TrendReq
### For Ticker Information
import yfinance as yf
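
The preview again cuts off after the imports. A minimal alignment sketch, assuming weekly pytrends data and the HTZ ticker implied by the filename; the keyword list and timeframe are illustrative.

pytrends = TrendReq(hl='en-US', tz=360)
pytrends.build_payload(['rent a car', 'car rental'], timeframe='today 12-m')
interest = pytrends.interest_over_time().drop(columns='isPartial')
# the sleep/randint imports above suggest throttling, e.g. sleep(randint(2, 5)),
# between build_payload calls when looping over many keywords

prices = yf.Ticker('HTZ').history(period='1y')
if prices.index.tz is not None:
    prices.index = prices.index.tz_localize(None)  # newer yfinance returns tz-aware dates
weekly_close = prices['Close'].resample('W').last()  # pytrends rows are week-ending Sundays

combined = interest.join(weekly_close, how='inner')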
### Import Libraries ###
import urllib
import requests
import pandas as pd
from tqdm.notebook import tqdm
from urllib.parse import urlparse
### Load API Key
api_key = ''
term = input('What keyword would you like to explore?')
df1 = pd.DataFrame(parse_response(requests.get(build_seo_urls(phrase=term)).content))
try:
    keyword_list = secondary_layer(crawl_urls=df1['Url'])
except KeyError as e:
    raise Exception("The keyword you entered is either not in SEMrush's database or was submitted incorrectly. Please rerun and try again.") from e
third_layer = third_layer_setup(second_layer_kw=keyword_list)
third_layer = third_layer.merge(keyword_list[['Keyword','Search Volume', 'CPC', 'Competition']], on="Keyword", how='left')
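
The helper functions called above (build_seo_urls, parse_response, secondary_layer, third_layer_setup) are not in this excerpt. A rough sketch of what the first two might look like against the SEMrush analytics API; the endpoint parameters and response parsing are assumptions.

from urllib.parse import urlencode

def build_seo_urls(phrase, database='us', limit=20):
    # phrase_organic lists the top organic URLs ranking for the phrase
    params = urlencode({'type': 'phrase_organic', 'key': api_key,
                        'phrase': phrase, 'database': database,
                        'export_columns': 'Dn,Ur', 'display_limit': limit})
    return 'https://api.semrush.com/?' + params

def parse_response(content):
    # SEMrush replies with ';'-separated text whose first row is the header
    rows = [r.split(';') for r in content.decode().strip().splitlines()]
    return [dict(zip(rows[0], row)) for row in rows[1:]]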
def log_parse(data):
    # Response size: in common log format the byte count follows the status code
    try:
        size = re.search(r'[0-9] (\d{1,4})', data).group(1)
    except AttributeError:
        size = 'n/a'
    # Server response: first http… token up to the closing quote
    try:
        server_response = re.search(r'http.*?[\"]', data).group(0).replace('"', '')
    except AttributeError:
        server_response = 'n/a'
    # (excerpt truncated: the full gist evidently also extracts status_code,
    # request_type, date, ip_address and user_agent, which later cells rely on)
    return {'size': size, 'server_response': server_response}
# Approach takes into consideration multiple log files stored locally in .gz format
import os
import gzip

log_dir = r'C:\Users\Derek.Hawkins\Log File Analysis Folder'
log_file_slugs = [x for x in os.listdir(log_dir) if x.endswith('.gz')]
main = []
for slug in log_file_slugs:
    with gzip.open(os.path.join(log_dir, slug), 'r') as fin:
        for line in tqdm(fin):
            try:
                main.append(log_parse(line.decode()))
            except AttributeError:
                continue  # skip lines the regexes cannot parse
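
The excerpt jumps straight from parsing to reading a pickle; presumably the parsed rows were assembled into a DataFrame and persisted in between. A two-line bridge, assumed rather than shown:

log_file = pd.DataFrame(main)
log_file.to_pickle('log.pkl')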
import socket

# Reverse-DNS each IP so crawler claims in the user agent can be checked
# (socket.gethostbyaddr raises socket.herror for IPs without a PTR record)
log_file = pd.read_pickle('log.pkl')
log_file['hostname'] = log_file.ip_address.apply(lambda ip: socket.gethostbyaddr(ip)[0])
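
Google's documented way to verify Googlebot extends this: reverse-resolve the IP, check the domain, then forward-resolve the hostname and confirm it maps back. A sketch of that check (the helper name is ours):

def is_verified_googlebot(ip):
    # reverse lookup, domain check, then forward lookup must round-trip to the same IP
    try:
        host = socket.gethostbyaddr(ip)[0]
        return host.endswith(('.googlebot.com', '.google.com')) and \
            socket.gethostbyname(host) == ip
    except (socket.herror, socket.gaierror):
        return False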
# Alternative: classify user agents with the crawlerdetect library
from crawlerdetect import CrawlerDetect

crawler_detect = CrawlerDetect()
validate = []
for crawl in log_file.user_agent:
    data = {'valid': crawler_detect.isCrawler(crawl),
            # the excerpt cuts off mid-dict; getMatches() returns the crawler just matched
            'crawler': crawler_detect.getMatches()}
    validate.append(data)
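
From there the flags can be joined back to keep only verified crawler hits; a short sketch, with the row alignment assumed:

bot_flags = pd.DataFrame(validate)
log_file = log_file.loc[bot_flags['valid'].values].reset_index(drop=True)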
import plotly.graph_objects as go

# Share of crawl hits by HTTP status code
pivot = log_file.pivot_table(index='status_code', values='server_response',
                             aggfunc={'server_response': 'count'})
fig = go.Figure(data=[go.Pie(labels=pivot.index, values=pivot.server_response)])
fig.show()
# Most-requested files, ranked by number of crawler pings
pivot = log_file.pivot_table(index='request_type', values='server_response',
                             aggfunc={'server_response': 'count'})\
    .sort_values(by='server_response', ascending=False).reset_index()\
    .rename(columns={'request_type': 'file_name', 'server_response': 'number_of_pings'})
fig = go.Figure(data=[go.Table(
    header=dict(values=list(pivot.columns),
                fill_color='blue',
                font=dict(color='white', size=12),
                align='left'),
    # the excerpt cuts off after the header; a cells block is assumed to finish the table
    cells=dict(values=[pivot[c] for c in pivot.columns],
               align='left'))])
fig.show()
# Crawl rate over time
pivot = log_file.pivot_table(index='date', values='server_response',
                             aggfunc={'server_response': 'count'}).rename(columns={'server_response': 'crawls'})
fig = go.Figure(data=go.Scatter(x=pivot.index, y=pivot.crawls, mode='lines'))
fig.update_layout(title='Crawl Rate: example.com',
                  xaxis_title='Date',
                  yaxis_title='Number of Pings by Search Engine')
fig.show()