Hamlet Batista hamletbatista

## rangediff
/**
 * Tells whether `r` has at least one element and its first element has
 * a length of exactly 2.
 *
 * @param {Array} r Value exposing `length` and index access.
 * @return {boolean} True only when both conditions hold.
 */
function is_valid_range_(r) {
  if (!(r.length > 0)) {
    return false;
  }
  var firstRow = r[0];
  return firstRow.length === 2;
}

/**
 * Converts each row of `r` into a `{ text, count }` object.
 *
 * `text` is the row's first cell with surrounding whitespace trimmed and
 * `count` is the row's second cell, unchanged.
 *
 * @param {Array<Array>} r Rows whose first cell is a string.
 * @return {Array<{text: string, count: *}>} One object per row, in row order.
 */
function get_range_values_(r) {
  var values = [];
  for (var i = 0; i < r.length; i++) {
    values.push({ text: r[i][0].trim(), count: r[i][1] });
  }
  return values;
}

## read_sitemap_index.py
from bs4 import BeautifulSoup
import requests

sitemap_index_url = "https://www.searchenginejournal.com/sitemap_index.xml"

# Starts empty here; presumably populated from `xml` by code not shown
# in this excerpt -- TODO confirm.
sitemap_index = {}

# A timeout keeps a stalled server from hanging the script, and
# raise_for_status() stops an HTTP error page from being kept as XML.
r = requests.get(sitemap_index_url, timeout=30)
r.raise_for_status()
xml = r.text

## read_xml_sitemaps.py
sitemaps = {}

# NOTE(review): `lasmod` looks like a typo for `lastmod`; the name is kept
# because code outside this excerpt may reference it.
for (sitemap_url, lasmod) in sitemap_index.items():
    # Plain substring test. Unlike the earlier `.find("post") > 0`, this
    # also accepts a URL that begins with "post".
    if "post" not in sitemap_url:
        continue
    print(sitemap_url)

    # A timeout keeps a stalled server from hanging the loop, and
    # raise_for_status() stops an HTTP error page from being kept as XML.
    r = requests.get(sitemap_url, timeout=30)
    r.raise_for_status()
    xml = r.text

## load_sitemap_to_pandas.py
import pandas as pd

# Show the installed pandas version; this snippet expects 0.23 or later.
print(pd.__version__)

# One row per key of `sitemaps`, with its value in a single "lastmod" column.
df = pd.DataFrame.from_dict(
    data=sitemaps,
    orient="index",
    columns=["lastmod"],
)
df.head(n=10)

## prepare_word_cloud.py
from collections import Counter
import re

import nltk
from nltk.corpus import stopwords

# Download the NLTK "stopwords" resource so stopwords.words('english')
# can be loaded later in this file.
nltk.download('stopwords')

from urllib.parse import urlparse

## create_word_cloud.py
cnt = Counter()
english_stopwords = set(stopwords.words('english'))

# Count every "-" or "/" separated piece of each path, ignoring empty
# pieces, purely numeric pieces and English stop words.
for path in df.path:
    for word in re.split("[-/]", path):
        if not word or word.isdigit() or word in english_stopwords:
            continue
        cnt[word] += 1

cnt.most_common(25)

## create_visual_word_cloud.py
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

# Keep only the words (not their counts) of the 25 most common entries.
word_cloud = [word for word, _count in cnt.most_common(25)]

# White background; a bare WordCloud() would use the default black one.
word_cloud_obj = WordCloud(
    max_words=25,
    background_color="white",
).generate(" ".join(word_cloud))

plt.imshow(word_cloud_obj, interpolation='bilinear')

## add_word_cloud_to_df.py
def get_category(path):
    """Return the first piece of `path` that appears in `word_cloud`.

    `path` is split on "-" and "/"; pieces are checked in order. When no
    piece is in the module-level `word_cloud`, "other" is returned.
    """
    matches = (piece for piece in re.split("[-/]", path) if piece in word_cloud)
    return next(matches, "other")


## splitting_google_urls_by_1k.py
# Rows whose "category" column equals "google".
google_df = df[df["category"] == "google"]

# Three consecutive 1000-row slices, then whatever remains after row 3000.
first, second, third = [
    google_df[start:start + 1000] for start in (0, 1000, 2000)
]
last = google_df[3000:]


## fake_transaction_pages.py
import numpy as np

# Rows whose path contains any of the listed terms. The .copy() gives an
# independent frame, so the column assignment below does not write to a
# slice of `df` (which triggers pandas' SettingWithCopyWarning).
high_value_pages = df[
    df.path.str.contains("adwords|facebook|ads|media", regex=True)
].copy()

# One random integer per row, drawn from 1 to 199 (the upper bound of
# np.random.randint is exclusive).
high_value_pages["fake_transactions"] = np.random.randint(
    1, 200, high_value_pages.shape[0]
)

high_value_pages = high_value_pages.reset_index()

fake_transaction_pages = high_value_pages[["path", "fake_transactions"]]
	/**
	 * Tells whether `r` has at least one element and its first element has
	 * a length of exactly 2.
	 *
	 * @param {Array} r Value exposing `length` and index access.
	 * @return {boolean} True only when both conditions hold.
	 */
	function is_valid_range_(r) {
	  if (!(r.length > 0)) {
	    return false;
	  }
	  var firstRow = r[0];
	  return firstRow.length === 2;
	}

	/**
	 * Converts each row of `r` into a `{ text, count }` object.
	 *
	 * `text` is the row's first cell with surrounding whitespace trimmed and
	 * `count` is the row's second cell, unchanged.
	 *
	 * @param {Array<Array>} r Rows whose first cell is a string.
	 * @return {Array<{text: string, count: *}>} One object per row, in row order.
	 */
	function get_range_values_(r) {
	  var values = [];
	  for (var i = 0; i < r.length; i++) {
	    values.push({ text: r[i][0].trim(), count: r[i][1] });
	  }
	  return values;
	}
	from bs4 import BeautifulSoup
	import requests

	sitemap_index_url = "https://www.searchenginejournal.com/sitemap_index.xml"

	# Starts empty here; presumably populated from `xml` by code not shown
	# in this excerpt -- TODO confirm.
	sitemap_index = {}

	# A timeout keeps a stalled server from hanging the script, and
	# raise_for_status() stops an HTTP error page from being kept as XML.
	r = requests.get(sitemap_index_url, timeout=30)
	r.raise_for_status()
	xml = r.text
	sitemaps = {}

	# NOTE(review): `lasmod` looks like a typo for `lastmod`; the name is kept
	# because code outside this excerpt may reference it.
	for (sitemap_url, lasmod) in sitemap_index.items():
	    # Plain substring test. Unlike the earlier `.find("post") > 0`, this
	    # also accepts a URL that begins with "post".
	    if "post" not in sitemap_url:
	        continue
	    print(sitemap_url)

	    # A timeout keeps a stalled server from hanging the loop, and
	    # raise_for_status() stops an HTTP error page from being kept as XML.
	    r = requests.get(sitemap_url, timeout=30)
	    r.raise_for_status()
	    xml = r.text
	import pandas as pd

	# Show the installed pandas version; this snippet expects 0.23 or later.
	print(pd.__version__)

	# One row per key of `sitemaps`, with its value in a single "lastmod" column.
	df = pd.DataFrame.from_dict(
	    data=sitemaps,
	    orient="index",
	    columns=["lastmod"],
	)
	df.head(n=10)
	from collections import Counter
	import re

	import nltk
	from nltk.corpus import stopwords

	# Download the NLTK "stopwords" resource so stopwords.words('english')
	# can be loaded later in this file.
	nltk.download('stopwords')

	from urllib.parse import urlparse
	cnt = Counter()
	english_stopwords = set(stopwords.words('english'))

	# Count every "-" or "/" separated piece of each path, ignoring empty
	# pieces, purely numeric pieces and English stop words. The loop nesting
	# is restored here; the flattened original had none.
	for path in df.path:
	    words = re.split("[-/]", path)
	    for word in words:
	        if word and word not in english_stopwords and not word.isdigit():
	            cnt[word] += 1

	cnt.most_common(25)
	from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
	import matplotlib.pyplot as plt

	# Keep only the words (not their counts) of the 25 most common entries.
	word_cloud = [word for word, _count in cnt.most_common(25)]

	# White background; a bare WordCloud() would use the default black one.
	word_cloud_obj = WordCloud(
	    max_words=25,
	    background_color="white",
	).generate(" ".join(word_cloud))

	plt.imshow(word_cloud_obj, interpolation='bilinear')
	def get_category(path):
	    """Return the first piece of `path` that appears in `word_cloud`.

	    `path` is split on "-" and "/"; pieces are checked in order. When no
	    piece is in the module-level `word_cloud`, "other" is returned.
	    """
	    words = re.split("[-/]", path)

	    for word in words:
	        if word in word_cloud:
	            return word

	    return "other"
	# Rows whose "category" column equals "google".
	google_df = df[df["category"] == "google"]

	# Three consecutive 1000-row slices, then whatever remains after row 3000.
	first, second, third = [
	    google_df[start:start + 1000] for start in (0, 1000, 2000)
	]
	last = google_df[3000:]
	import numpy as np

	# Rows whose path contains any of the listed terms. The pipes must be
	# unescaped: "\|" matches a literal "|" character, not an alternative.
	# The .copy() gives an independent frame, so the column assignment below
	# does not write to a slice of `df` (SettingWithCopyWarning).
	high_value_pages = df[
	    df.path.str.contains("adwords|facebook|ads|media", regex=True)
	].copy()

	# One random integer per row, drawn from 1 to 199 (the upper bound of
	# np.random.randint is exclusive).
	high_value_pages["fake_transactions"] = np.random.randint(
	    1, 200, high_value_pages.shape[0]
	)

	high_value_pages = high_value_pages.reset_index()

	fake_transaction_pages = high_value_pages[["path", "fake_transactions"]]