Skip to content

Instantly share code, notes, and snippets.

Hamlet Batista hamletbatista

Block or report user

Report or block hamletbatista

Hide content and notifications from this user.

Learn more about blocking users

Contact Support about this user’s behavior.

Learn more about reporting abuse

Report abuse
View GitHub Profile
@hamletbatista
hamletbatista / rangediff
Created Nov 9, 2017
A custom Google Sheets function that compares two data sets with a pair of related values, one textual and one numeric
View rangediff
/**
 * Tells whether a range can be used for comparison: it must contain at
 * least one row, and its first row must contain exactly two cells.
 *
 * @param {Array} r Array of rows, each row itself an array of cells.
 * @return {boolean} true when the range has the expected two-column shape.
 */
function is_valid_range_(r) {
  // Guard first so r[0] is never touched on an empty range.
  if (!(r.length > 0)) {
    return false;
  }
  var firstRow = r[0];
  return firstRow.length === 2;
}
function get_range_values_(r) {
var values = []
for(var i=0;i<r.length;i++) {
values.push({ text: r[i][0].trim(), count: r[i][1] });
}
return values;
@hamletbatista
hamletbatista / read_xml_sitemaps.py
Created Feb 27, 2019
Read URLs from XML Sitemap
View read_xml_sitemaps.py
sitemaps = {}
for (sitemap_url, lasmod) in sitemap_index.items():
if(sitemap_url.find("post") > 0):
print(sitemap_url)
if 1: # for testing
r = requests.get(sitemap_url)
xml = r.text
View prepare_word_cloud.py
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from urllib.parse import urlparse
View create_word_cloud.py
# Count how often each path token occurs. Tokens are the pieces of a path
# between "-" and "/" separators; empty tokens, English stopwords and purely
# numeric tokens are not counted.
english_stopwords = set(stopwords.words('english'))
cnt = Counter()
for path in df.path:
    words = re.split("[-/]", path)
    cnt.update(
        token
        for token in words
        if token and not token.isdigit() and token not in english_stopwords
    )
# Show the 25 most frequent tokens with their counts.
cnt.most_common(25)
View create_visual_word_cloud.py
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
# Keep just the words (dropping the counts) of the 25 most frequent tokens.
word_cloud = [term for term, _ in cnt.most_common(25)]
# Use a white background; the WordCloud default is black.
word_cloud_obj = WordCloud(
    background_color="white",
    max_words=25,
).generate(" ".join(word_cloud))
plt.imshow(word_cloud_obj, interpolation='bilinear')
View add_word_cloud_to_df.py
def get_category(path):
    """Return the first token of *path* that appears in ``word_cloud``.

    The path is split on "-" and "/" and its tokens are checked in order.
    ``word_cloud`` is a module-level collection defined outside this
    function. If no token matches, the string "other" is returned.
    """
    tokens = re.split("[-/]", path)
    matches = (token for token in tokens if token in word_cloud)
    return next(matches, "other")
View splitting_google_urls_by_1k.py
# Select the entries whose category is "google", then cut them into three
# consecutive batches of 1000 plus a final batch with everything from
# position 3000 onward.
# NOTE(review): assumes df is a pandas DataFrame, so the slices select rows
# by position — confirm against where df is built.
google_df = df[df["category"] == "google"]
first, second, third = (
    google_df[start:start + 1000] for start in (0, 1000, 2000)
)
last = google_df[3000:]
View fake_transaction_pages.py
import numpy as np

# Pages whose path contains one of the keywords "adwords", "facebook", "ads"
# or "media".
# .copy() makes the selection an independent frame, so the column assignment
# below does not write into a filtered selection of df (the pattern that
# triggers pandas' SettingWithCopyWarning).
high_value_pages = df[
    df.path.str.contains("adwords|facebook|ads|media", regex=True)
].copy()

# One random placeholder transaction count per selected row, drawn from
# 1..199 (np.random.randint excludes the upper bound).
high_value_pages["fake_transactions"] = np.random.randint(
    1, 200, high_value_pages.shape[0]
)

# reset_index() moves the previous index into a regular column; only the
# path and the generated counts are kept in the final frame.
high_value_pages = high_value_pages.reset_index()
fake_transaction_pages = high_value_pages[["path", "fake_transactions"]]
View write_xml_sitemap.py
from jinja2 import Template
sitemap_template="""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{% for page in pages %}
<url>
<loc>{{page[0]|safe}}</loc>
<lastmod>{{page[1]}}</lastmod>
</url>
{% endfor %}
View crawl_redirects.py
def get_redirects(url):
    """Send a HEAD request to *url* and report how it responds.

    Returns:
        ``(url, None, "Error")`` if the request raised an exception;
        ``(url, status_code, location)`` for status 301, 302 or 307, where
        ``location`` is the ``Location`` response header (``None`` if the
        header is absent);
        ``(url, 404, None)`` for status 404;
        ``None`` for any other status code.
    """
    try:
        # Only the status code and headers are inspected below.
        r = requests.head(url)
    except Exception:
        # Best-effort: any request failure is reported as an "Error" row.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long crawl can still be interrupted.
        return (url, None, "Error")

    if r.status_code in (301, 302, 307):
        # .get() avoids a KeyError on a redirect sent without Location.
        return (url, r.status_code, r.headers.get('Location'))
    if r.status_code == 404:
        return (url, r.status_code, None)
    return None
You can’t perform that action at this time.