Hamlet Batista hamletbatista

## rangediff
/**
 * Tell whether `r` is a non-empty array whose first row has exactly two
 * entries. Only the first row is inspected.
 *
 * @param {Array} r rows to check
 * @return {boolean} true when r[0] exists and r[0].length is 2
 */
function is_valid_range_(r) {
  if (!(r.length > 0)) {
    return false;
  }
  var firstRow = r[0];
  return firstRow.length === 2;
}

/**
 * Convert rows of [text, count] into objects.
 *
 * @param {Array} r rows; r[i][0] must be a string (it is trimmed),
 *     r[i][1] is copied through unchanged
 * @return {Array} one { text, count } object per row, in row order
 */
function get_range_values_(r) {
  var values = [];
  for (var i = 0; i < r.length; i++) {
    var row = r[i];
    values.push({ text: row[0].trim(), count: row[1] });
  }
  return values;
}

## read_xml_sitemaps.py
# Walk the sitemap index and download the XML of every sitemap whose URL
# contains "post" somewhere after its first character.
sitemaps = {}

for sitemap_url, lasmod in sitemap_index.items():
  # str.find gives -1 when "post" is absent and 0 when the URL starts with
  # it; both cases are skipped.
  if sitemap_url.find("post") <= 0:
    continue
  print(sitemap_url)

  if True:  # for testing
    r = requests.get(sitemap_url)
    xml = r.text

## prepare_word_cloud.py
from collections import Counter
import re

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK "stopwords" resource. This call runs as soon as the snippet
# is executed, before the remaining import below.
nltk.download('stopwords')

from urllib.parse import urlparse

## create_word_cloud.py
# Tally how often each word occurs across the URL paths in df.path.
# A path is broken into words on "-" and "/".
cnt = Counter()
english_stopwords = set(stopwords.words('english'))

for path in df.path:
  words = re.split("[-/]", path)
  for word in words:
    # Ignore empty fragments, purely numeric fragments and English stopwords.
    if not word or word.isdigit() or word in english_stopwords:
      continue
    cnt[word] += 1

# Bare expression: the 25 most frequent words are not stored anywhere
# (presumably shown by the notebook).
cnt.most_common(25)

## create_visual_word_cloud.py
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

# Keep only the words (counts dropped) of the 25 most frequent entries.
word_cloud = [word for word, _count in cnt.most_common(25)]

# Build the cloud from the space-joined words on a white background; the
# original note says a plain WordCloud() renders with a black background.
word_cloud_obj = WordCloud(
  background_color="white",
  max_words=25,
).generate(" ".join(word_cloud))

plt.imshow(word_cloud_obj, interpolation='bilinear')

## add_word_cloud_to_df.py
def get_category(path):
  """Return the first fragment of `path` found in `word_cloud`, else "other".

  `path` is split on "-" and "/" and the fragments are checked from left to
  right against the module-level `word_cloud` collection.
  """
  fragments = re.split("[-/]", path)
  return next(
    (fragment for fragment in fragments if fragment in word_cloud),
    "other",
  )


## splitting_google_urls_by_1k.py
# Rows of df whose "category" column equals "google".
google_df = df[df["category"] == "google"]

# Cut those rows into three consecutive 1000-row slices plus the remainder.
first, second, third, last = (
  google_df[:1000],
  google_df[1000:2000],
  google_df[2000:3000],
  google_df[3000:],
)


## fake_transaction_pages.py
# Rows of df whose path contains "adwords", "facebook", "ads" or "media".
# .copy() makes this an independent frame, so the column assignment below
# writes to it and not to a possible view of df (avoids pandas'
# SettingWithCopyWarning).
high_value_pages = df[df.path.str.contains("adwords|facebook|ads|media", regex=True)].copy()

import numpy as np

# One random integer per selected row; randint's upper bound is exclusive,
# so the values fall in 1..199.
high_value_pages["fake_transactions"] = np.random.randint(1, 200, high_value_pages.shape[0])

high_value_pages = high_value_pages.reset_index()

# Keep only the two columns used downstream.
fake_transaction_pages = high_value_pages[["path", "fake_transactions"]]

## write_xml_sitemap.py
from jinja2 import Template

# Jinja2 source for an XML sitemap. The template iterates over `pages` and
# reads page[0] for <loc> and page[1] for <lastmod>. The <urlset> element and
# the string literal are closed here; the snippet previously ended before both.
sitemap_template="""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    {% for page in pages %}
    <url>
        <loc>{{page[0]|safe}}</loc>
        <lastmod>{{page[1]}}</lastmod>
    </url>
    {% endfor %}
</urlset>"""

## crawl_redirects.py
def get_redirects(url):
  """Send a HEAD request to `url` and report redirects and missing pages.

  Returns:
    (url, None, "Error")            if the request raised an exception.
    (url, status, location)         for status 301, 302 or 307, where
                                    `location` is the Location header, or
                                    None when the header is absent.
    (url, 404, None)                for status 404.
    None                            for any other status code.
  """
  try:
    # HEAD is enough to read the status and headers. requests.head does not
    # follow redirects by default, so a 3xx status is seen here directly.
    r = requests.head(url)
  except Exception:
    # Best-effort: any request failure is reported, not raised. Unlike a
    # bare `except:`, KeyboardInterrupt and SystemExit still propagate.
    return (url, None, "Error")
  if r.status_code in (301, 302, 307):
    # .get avoids a KeyError on a redirect response without a Location header.
    return (url, r.status_code, r.headers.get('Location'))
  if r.status_code == 404:
    return (url, r.status_code, None)
  return None
	/**
	 * Tell whether `r` is a non-empty array whose first row has exactly two
	 * entries. Only the first row is inspected.
	 *
	 * @param {Array} r rows to check
	 * @return {boolean} true when r[0] exists and r[0].length is 2
	 */
	function is_valid_range_(r) {
	  if (!(r.length > 0)) {
	    return false;
	  }
	  var firstRow = r[0];
	  return firstRow.length === 2;
	}

	/**
	 * Convert rows of [text, count] into objects.
	 *
	 * @param {Array} r rows; r[i][0] must be a string (it is trimmed),
	 *     r[i][1] is copied through unchanged
	 * @return {Array} one { text, count } object per row, in row order
	 */
	function get_range_values_(r) {
	  var values = [];
	  for (var i = 0; i < r.length; i++) {
	    var row = r[i];
	    values.push({ text: row[0].trim(), count: row[1] });
	  }
	  return values;
	}
	# Walk the sitemap index and download the XML of every sitemap whose URL
	# contains "post" somewhere after its first character. The loop and branch
	# bodies are re-indented here; the pasted text had lost all nesting.
	sitemaps = {}

	for (sitemap_url, lasmod) in sitemap_index.items():
	  # str.find gives -1 when absent and 0 for a match at the very start;
	  # neither passes the "> 0" test.
	  if sitemap_url.find("post") > 0:
	    print(sitemap_url)

	    if 1: # for testing
	      r = requests.get(sitemap_url)
	      xml = r.text
	from collections import Counter
	import re

	import nltk
	from nltk.corpus import stopwords

	# Fetch the NLTK "stopwords" resource. This call runs as soon as the snippet
	# is executed, before the remaining import below.
	nltk.download('stopwords')

	from urllib.parse import urlparse
	# Tally how often each word occurs across the URL paths in df.path.
	# A path is broken into words on "-" and "/". Loop nesting is restored
	# here; the pasted text had lost its indentation.
	cnt = Counter()
	english_stopwords = set(stopwords.words('english'))

	for path in df.path:
	  words = re.split("[-/]", path)
	  for word in words:
	    # Count only non-empty, non-numeric words that are not stopwords.
	    if word and word not in english_stopwords and not word.isdigit():
	      cnt[word] += 1

	# Bare expression: the 25 most frequent words are not stored anywhere
	# (presumably shown by the notebook).
	cnt.most_common(25)
	from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
	import matplotlib.pyplot as plt

	# Keep only the words (counts dropped) of the 25 most frequent entries.
	word_cloud = [word for word, _count in cnt.most_common(25)]

	# Build the cloud from the space-joined words on a white background; the
	# original note says a plain WordCloud() renders with a black background.
	word_cloud_obj = WordCloud(
	  background_color="white",
	  max_words=25,
	).generate(" ".join(word_cloud))

	plt.imshow(word_cloud_obj, interpolation='bilinear')
	def get_category(path):
	  """Return the first fragment of `path` found in `word_cloud`, else "other".

	  `path` is split on "-" and "/" and the fragments are checked from left
	  to right against the module-level `word_cloud` collection.
	  """
	  words = re.split("[-/]", path)

	  for word in words:
	    if word in word_cloud:
	      return word

	  return "other"
	# Rows of df whose "category" column equals "google".
	google_df = df[df["category"] == "google"]

	# Cut those rows into three consecutive 1000-row slices plus the remainder.
	first, second, third, last = (
	  google_df[:1000],
	  google_df[1000:2000],
	  google_df[2000:3000],
	  google_df[3000:],
	)
	# Rows of df whose path contains "adwords", "facebook", "ads" or "media".
	# The alternation uses plain "|": an escaped "\|" would match a literal
	# pipe character and not act as "or". .copy() makes this an independent
	# frame, so the column assignment below writes to it and not to a possible
	# view of df (avoids pandas' SettingWithCopyWarning).
	high_value_pages = df[df.path.str.contains("adwords|facebook|ads|media", regex=True)].copy()

	import numpy as np

	# One random integer per selected row; randint's upper bound is exclusive,
	# so the values fall in 1..199.
	high_value_pages["fake_transactions"] = np.random.randint(1, 200, high_value_pages.shape[0])

	high_value_pages = high_value_pages.reset_index()

	# Keep only the two columns used downstream.
	fake_transaction_pages = high_value_pages[["path", "fake_transactions"]]
	from jinja2 import Template

	# Jinja2 source for an XML sitemap. The template iterates over `pages` and
	# reads page[0] for <loc> and page[1] for <lastmod>. The filter separator is
	# a plain "|" (a backslash before it is not valid Jinja syntax), and the
	# <urlset> element and the string literal are closed here.
	sitemap_template="""<?xml version="1.0" encoding="UTF-8"?>
	<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
	    {% for page in pages %}
	    <url>
	        <loc>{{page[0]|safe}}</loc>
	        <lastmod>{{page[1]}}</lastmod>
	    </url>
	    {% endfor %}
	</urlset>"""
	def get_redirects(url):
	  """Send a HEAD request to `url` and report redirects and missing pages.

	  Returns:
	    (url, None, "Error")            if the request raised an exception.
	    (url, status, location)         for status 301, 302 or 307, where
	                                    `location` is the Location header, or
	                                    None when the header is absent.
	    (url, 404, None)                for status 404.
	    None                            for any other status code.
	  """
	  try:
	    # HEAD is enough to read the status and headers. requests.head does
	    # not follow redirects by default, so a 3xx status is seen directly.
	    r = requests.head(url)
	  except Exception:
	    # Best-effort: any request failure is reported, not raised. Unlike a
	    # bare `except:`, KeyboardInterrupt and SystemExit still propagate.
	    return (url, None, "Error")
	  if r.status_code in (301, 302, 307):
	    # .get avoids a KeyError on a redirect without a Location header.
	    return (url, r.status_code, r.headers.get('Location'))
	  if r.status_code == 404:
	    return (url, r.status_code, None)
	  return None