@eliasdabbas
Last active April 6, 2022 20:35
Fetch a site's robots.txt file, get the relevant XML sitemap, extract and split the URLs, and count words in article titles. Watch this for more details: https://bit.ly/3HMZC0A
# pip install advertools==0.14.0a7
# get the robots.txt file, save to csv:
advertools robots --url https://www.economist.com/robots.txt econ_robots.csv
# find lines that start with "sitemap" (case-insensitive), take the second CSV field (the URL), save to variable sitemap_url
sitemap_url=$(grep ^sitemap -i econ_robots.csv | cut -d , -f 2)
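A quick sanity check of the grep/cut step above, run against a one-line sample file (the sample row is illustrative, not real output from economist.com):

```shell
# sample robots.csv row in the directive,content layout (illustrative data)
echo 'Sitemap,https://www.economist.com/sitemap.xml' > sample_robots.csv
# same pipeline: case-insensitive match on lines starting with "sitemap", keep field 2
sitemap_url=$(grep ^sitemap -i sample_robots.csv | cut -d , -f 2)
echo "$sitemap_url"
```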
# get the sitemap index file without downloading the sub-sitemaps (--recursive 0 disables recursion)
advertools sitemaps $sitemap_url econ_sitemap.csv --recursive 0
# select the sitemap for Q1 and fetch it
advertools sitemaps https://www.economist.com/sitemap-2022-Q1.xml econ_q1_sitemap.csv
# get the first column (URLs) and save it to a text file
cut -d , -f 1 econ_q1_sitemap.csv > econ_urls.txt
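The same first-column extraction, demonstrated on a couple of sample rows (the header and URL are illustrative, not real sitemap data):

```shell
# illustrative two-row CSV: header plus one data row; keep only field 1
printf 'loc,lastmod\nhttps://example.com/a,2022-01-01\n' | cut -d , -f 1 > sample_urls.txt
cat sample_urls.txt
```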
# split the URLs into components and save the result to a new file
advertools urls econ_urls.txt econ_url_split.csv
# get the 13th column ("last_dir", containing the article slugs), convert hyphens to spaces, save to a new file
cut -d , -f 13 econ_url_split.csv | tr - " " > econ_words.txt
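The tr step is easy to see on a single sample slug (the slug below is made up for illustration):

```shell
# illustrative slug: tr replaces every hyphen with a space, turning a slug into words
words=$(echo 'how-the-war-changed-europe' | tr - ' ')
echo "$words"
```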
# count word frequencies and save the counts to a new file
advertools wordfreq econ_words.txt econ_word_counts.csv
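If advertools isn't available, a rough plain-shell sketch of the word-count step (not the actual wordfreq output format; the two sample titles are made up): split on whitespace so each word is on its own line, then count with sort | uniq -c.

```shell
# split words onto separate lines, count duplicates, sort by descending frequency
printf 'war in europe\nwar and peace\n' | tr ' ' '\n' | sort | uniq -c | sort -rn > sample_counts.txt
cat sample_counts.txt
```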