nmolivo/tesu_scraper_03

## tesu_scraper_03
  # Open the doc (a zip archive) from the docs folder and extract its XML.
  # The context manager closes the archive even if read() raises, e.g. when
  # the 'word/document.xml' member is missing.
  pathway = '/home/ec2-user/ec2docs/'+file
  with zipfile.ZipFile(pathway) as document:
    xml_content = document.read('word/document.xml')
  # Decode the bytes instead of calling str() on them: str(bytes) produces
  # the "b'...'" repr, in which non-ASCII bytes, quotes and newlines appear
  # as backslash escapes that would end up inside the extracted links.
  # Assumes the XML is UTF-8 (TODO confirm for non-Word producers);
  # errors='replace' keeps this step from raising on malformed bytes.
  xml_str = xml_content.decode('utf-8', errors='replace')

  # Create the link list for the doc: find every run of text that sits
  # between a '>' and the next '<' and starts with 'http'. The capture group
  # returns only the text itself, without the surrounding '>' and '<'.
  link_list = re.findall(r'>(http.*?)<', xml_str)

  # Replace &amp; with &, and other HTML entities.
  link_list = [html.unescape(x) for x in link_list]
	# Open the doc (a zip archive) from the docs folder and extract its XML.
	# The context manager closes the archive even if read() raises, e.g. when
	# the 'word/document.xml' member is missing.
	pathway = '/home/ec2-user/ec2docs/'+file
	with zipfile.ZipFile(pathway) as document:
		xml_content = document.read('word/document.xml')
	# Decode the bytes instead of calling str() on them: str(bytes) produces
	# the "b'...'" repr, in which non-ASCII bytes, quotes and newlines appear
	# as backslash escapes that would end up inside the extracted links.
	# Assumes the XML is UTF-8 (TODO confirm for non-Word producers);
	# errors='replace' keeps this step from raising on malformed bytes.
	xml_str = xml_content.decode('utf-8', errors='replace')

	# Create the link list for the doc: find every run of text that sits
	# between a '>' and the next '<' and starts with 'http'. The capture group
	# returns only the text itself, without the surrounding '>' and '<'.
	link_list = re.findall(r'>(http.*?)<', xml_str)

	# Replace &amp; with &, and other HTML entities.
	link_list = [html.unescape(x) for x in link_list]