Skip to content

Instantly share code, notes, and snippets.

View hamletbatista's full-sized avatar

Hamlet Batista hamletbatista

View GitHub Profile
@hamletbatista
hamletbatista / rangediff
Created November 9, 2017 21:54
A custom Google Sheets function that compares two data sets with a pair of related values, one textual and one numeric
/**
 * Reports whether `r` looks like a usable two-column range.
 *
 * A range is accepted when it has at least one row and that first row
 * holds exactly two entries. Only the first row is inspected.
 *
 * @param {Array} r Rows of the range, each row being an array of cells.
 * @return {boolean} True when the first row has exactly two entries.
 */
function is_valid_range_(r) {
  if (r.length > 0) {
    return r[0].length === 2;
  }
  return false;
}
/**
 * Converts the rows of a range into a list of { text, count } records.
 *
 * For every row, the first cell becomes `text` (with surrounding
 * whitespace stripped) and the second cell becomes `count` unchanged.
 *
 * @param {Array} r Rows of the range; the first cell of each row must be
 *     a string, because `trim()` is called on it.
 * @return {Array} One { text, count } object per input row, in row order.
 */
function get_range_values_(r) {
  var values = [];
  for (var i = 0; i < r.length; i++) {
    values.push({ text: r[i][0].trim(), count: r[i][1] });
  }
  return values;
}
@hamletbatista
hamletbatista / read_xml_sitemaps.py
Created February 27, 2019 21:51
Read URLs from XML Sitemap
# Download the XML of each sitemap listed in sitemap_index whose URL
# contains "post", printing every matching URL as it goes.
# NOTE(review): sitemap_index is defined outside this snippet; it is used
# through .items(), so presumably a dict of sitemap URL -> last-modified
# value -- confirm. The second loop variable (lasmod) is not used in the
# lines shown here.
# NOTE(review): sitemaps is initialised but never filled in the visible
# lines, and the original indentation is missing, so the snippet looks
# truncated -- confirm against the full gist before relying on it.
sitemaps = {}
for (sitemap_url, lasmod) in sitemap_index.items():
# str.find returns -1 when "post" is absent; "> 0" therefore also rejects
# a match at index 0 (a URL that starts with "post").
if(sitemap_url.find("post") > 0):
print(sitemap_url)
if 1: # for testing
# Fetch the sitemap and keep the response body as text.
r = requests.get(sitemap_url)
xml = r.text
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from urllib.parse import urlparse
# Count how often each word occurs across the values of df.path.
# NOTE(review): df is defined outside this snippet; df.path presumably holds
# URL path strings -- confirm.
cnt = Counter()
english_stopwords = set(stopwords.words('english'))

for path in df.path:
    # Each value is broken into words on "-" and "/".
    words = re.split(r"[-/]", path)
    for word in words:
        # Skip empty fragments (e.g. from a leading "/"), English stopwords
        # and purely numeric tokens.
        if word and word not in english_stopwords and not word.isdigit():
            cnt[word] += 1

# Show the 25 most frequent words with their counts.
cnt.most_common(25)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
# Keep only the words (not the counts) of the 25 most common entries.
word_cloud = [word for word, _count in cnt.most_common(25)]

# Render the words as a cloud. The white background is requested explicitly
# because a plain WordCloud() falls back to a black one.
word_cloud_obj = WordCloud(
    max_words=25,
    background_color="white",
).generate(" ".join(word_cloud))

plt.imshow(word_cloud_obj, interpolation='bilinear')
def get_category(path):
    """Return the first word of *path* that is listed in ``word_cloud``.

    The path is split on "-" and "/" and the resulting words are checked in
    order against the module-level ``word_cloud`` collection.

    Args:
        path: String to categorise.

    Returns:
        The first matching word, or "other" when no word of the path is in
        ``word_cloud``.
    """
    for word in re.split(r"[-/]", path):
        if word in word_cloud:
            return word
    return "other"
# Rows whose category is "google".
google_df = df.loc[df["category"] == "google"]

# Cut those rows into three consecutive chunks of 1000 rows each ...
first, second, third = (
    google_df[start:start + 1000] for start in (0, 1000, 2000)
)
# ... and put whatever remains after row 3000 into a final chunk.
last = google_df[3000:]
import numpy as np

# Select the rows whose path matches any of the listed keywords. The explicit
# .copy() makes the result an independent frame, so adding a column below
# neither triggers pandas' SettingWithCopyWarning nor depends on whether the
# filter returned a view or a copy of df.
high_value_pages = df[df.path.str.contains("adwords|facebook|ads|media", regex=True)].copy()

# Attach a random integer per row as a stand-in transaction count;
# randint's upper bound is exclusive, so values fall in 1..199.
high_value_pages["fake_transactions"] = np.random.randint(1, 200, high_value_pages.shape[0])

# reset_index() renumbers the rows from 0 and keeps the old index as a column.
high_value_pages = high_value_pages.reset_index()

# Keep just the two columns of interest.
fake_transaction_pages = high_value_pages[["path", "fake_transactions"]]
from jinja2 import Template
# Jinja2 source for an XML sitemap. It expects a `pages` iterable and emits
# one <url> entry per item, using item[0] as <loc> and item[1] as <lastmod>.
# The closing </urlset> tag and the end of the string literal were missing
# from the snippet and are restored here so that the literal terminates and
# the generated document is well-formed.
sitemap_template = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{% for page in pages %}
<url>
<loc>{{page[0]|safe}}</loc>
<lastmod>{{page[1]}}</lastmod>
</url>
{% endfor %}
</urlset>"""
@hamletbatista
hamletbatista / crawl_redirects.py
Created February 28, 2019 21:58
crawl redirects
def get_redirects(url):
try:
# r = requests.get(url)
r = requests.head(url)
except:
return (url, None, "Error")
if r.status_code in [301, 302, 307]:
return (url, r.status_code, r.headers['Location'])
elif r.status_code == 404:
return (url, r.status_code, None)