# Joshuacourse/Web_scraping.py

## Web_scraping.py
### Scraping using the "Beautiful Soup" package for Python
### For every site listed in "record.txt", scrape the text of its country-rank
### link and append it to a text file called "record3.txt"
# NOTE(review): `requests` and `BeautifulSoup` are used below, but no import
# for them is visible in this part of the file -- confirm they are imported.
with open('record.txt', 'r') as f:  # 'r' for read
    lines3 = f.readlines()

for line in lines3:
    # readlines() keeps the trailing newline; strip it so it does not end up
    # inside the request URL, and skip lines that are blank.
    site = line.strip()
    if not site:
        continue
    url = "http://www.alexa.com/siteinfo/" + site
    text = requests.get(url).text
    soup = BeautifulSoup(text)
    # find() returns None when the element is missing; in that case the
    # chained call below raises AttributeError and the run stops.
    TT = soup.find("span", {"data-cat": "countryRank"}).find("a").get_text()
    # Append one line per site. The file is closed again after every write,
    # so lines already written are on disk if a later request fails.
    with open('record3.txt', 'a') as f4:
        f4.write(TT + "\n")

# Read "record1.txt" once, after the loop (it used to be reopened on every
# iteration, and was never opened at all when "record.txt" was empty).
with open('record1.txt', 'r') as f2:  # 'r' for read
    lines = f2.readlines()
print(lines)
## "http://www.alexa.com" + lines[1].strip()
with open('record2.txt', 'r') as f3:  # 'r' for read
    lines2 = f3.readlines()


### Create an empty dictionary "M" for the storage of the Top-500 websites for each country
# Each stripped line of `lines2` becomes a key mapped to its own empty list;
# OrderedDict keeps the keys in the order the lines were read.
from collections import OrderedDict

print(lines2)
M = OrderedDict((country.strip(), []) for country in lines2)
print(M)

### Export the dictionary "M" as a JSON file called "Site.json"
import json

json_str = json.dumps(M)
print(M)
# Text mode ('w'): json.dumps() returns a str, which a file opened in binary
# mode ('wb') rejects with TypeError on Python 3.
with open('Site.json', 'w') as f:
    f.write(json_str)
	### Scraping using the "Beautiful Soup" package for Python
### For every site listed in "record.txt", scrape the text of its country-rank
### link and append it to a text file called "record3.txt"
# NOTE(review): this section repeats the steps performed by the code above it
# in this file, so running the whole file requests every site twice and
# appends each result to "record3.txt" a second time -- confirm that the
# duplication is intended.
# NOTE(review): `requests` and `BeautifulSoup` are used below, but no import
# for them is visible in this part of the file -- confirm they are imported.
import json
from collections import OrderedDict

with open('record.txt', 'r') as f:  # 'r' for read
    lines3 = f.readlines()

for line in lines3:
    # readlines() keeps the trailing newline; strip it so it does not end up
    # inside the request URL, and skip lines that are blank.
    site = line.strip()
    if not site:
        continue
    url = "http://www.alexa.com/siteinfo/" + site
    text = requests.get(url).text
    soup = BeautifulSoup(text)
    # find() returns None when the element is missing; in that case the
    # chained call below raises AttributeError and the run stops.
    TT = soup.find("span", {"data-cat": "countryRank"}).find("a").get_text()
    # Append one line per site. The file is closed again after every write,
    # so lines already written are on disk if a later request fails.
    with open('record3.txt', 'a') as f4:
        f4.write(TT + "\n")

with open('record1.txt', 'r') as f2:  # 'r' for read
    lines = f2.readlines()
print(lines)
## "http://www.alexa.com" + lines[1].strip()
with open('record2.txt', 'r') as f3:  # 'r' for read
    lines2 = f3.readlines()


### Create an empty dictionary "M" for the storage of the Top-500 websites for each country
# Each stripped line of `lines2` becomes a key mapped to its own empty list;
# OrderedDict keeps the keys in the order the lines were read.
print(lines2)
M = OrderedDict((country.strip(), []) for country in lines2)
print(M)

### Export the dictionary "M" as a JSON file called "Site.json"
json_str = json.dumps(M)
print(M)
# Text mode ('w'): json.dumps() returns a str, which a file opened in binary
# mode ('wb') rejects with TypeError on Python 3.
with open('Site.json', 'w') as f:
    f.write(json_str)