Skip to content

Instantly share code, notes, and snippets.

@stevehenderson
Created October 28, 2020 04:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stevehenderson/6a999c69d9c82c6a385be39b7850a381 to your computer and use it in GitHub Desktop.
Save stevehenderson/6a999c69d9c82c6a385be39b7850a381 to your computer and use it in GitHub Desktop.
import scrapy
import re
import json
class FacultySpider(scrapy.Spider):
    """Scrape individual faculty profile pages under compsci.uncg.edu.

    Each profile page is fetched once and reduced to a flat dict of strings
    (name, institution, school, department, title, phone, email, teaching,
    research). The dict is printed to stdout as one JSON document per page
    and also returned to Scrapy so feed exports and item pipelines see it.
    """

    name = "faculty"

    # Removes HTML tags and tab characters.
    _TAGS_AND_TABS = re.compile(r"<[^<]+?>|\t")
    # Same as above, plus CRLF line breaks and pairs of whitespace characters.
    _TAGS_AND_SPACING = re.compile(r"<[^<]+?>|\t|\r\n|\s\s")

    # Absolute XPaths into the profile page layout, keyed by the raw field
    # they extract. These are tied to the exact DOM structure of the site.
    _XPATHS = {
        "name": "/html/body/div[2]/div[1]/div/div/article/div[4]/div[2]/div/h1",
        "email": "/html/body/div[2]/div[1]/div/div/article/div[4]/div[2]/div/div[2]/p[2]/a[1]",
        "institution": "/html/body/div[1]/nav/div/div[1]/div[1]/a",
        "school": "/html/body/div[2]/div[1]/div/div/article/div[1]/a",
        "title": "/html/body/div[2]/div[1]/div/div/article/div[4]/div[2]/div/div[2]/p[1]/strong",
        "phone": "/html/body/div[2]/div[3]/div/div[1]/p/text()[6]",
        "person_info": "/html/body/div[2]/div[1]/div/div/article/div[4]/div[2]/div/div[2]/p[3]",
    }

    # Literal text that separates the research part of the profile paragraph
    # from the teaching part.
    _TEACHING_MARKER = "Teaching"

    def start_requests(self):
        """Yield one request per hard-coded faculty profile URL."""
        urls = [
            'https://compsci.uncg.edu/faculty/minjeong-kim/',
            'https://compsci.uncg.edu/faculty/sami-khuri/',
            'https://compsci.uncg.edu/faculty/ariyawansa/',
            'https://compsci.uncg.edu/faculty/armstrong/',
            'https://compsci.uncg.edu/faculty/green/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract one faculty record from a profile page.

        Args:
            response: The Scrapy response for a faculty profile page.

        Returns:
            A dict of cleaned string fields. A field whose XPath matches
            nothing is an empty string instead of raising, and ``teaching``
            is empty when the profile paragraph has no "Teaching" marker.
        """
        # default="" keeps a missing node from turning into None, which
        # would make the regex substitutions and the split below raise.
        raw = {
            field: response.xpath(xpath).get(default="")
            for field, xpath in self._XPATHS.items()
        }

        missing = [field for field, value in raw.items() if not value]
        if missing:
            self.logger.warning(
                "%s: no match for %s", response.url, ", ".join(missing)
            )

        # The profile paragraph holds research text followed by teaching
        # text; the marker itself is consumed by the split, so it is
        # re-added as a "Teaching: " prefix.
        parts = raw["person_info"].split(self._TEACHING_MARKER)
        research = parts[0]
        teaching = ("Teaching: " + parts[1]) if len(parts) > 1 else ""

        school = self._TAGS_AND_SPACING.sub("", raw["school"])
        faculty = {
            "name": self._TAGS_AND_TABS.sub("", raw["name"]),
            "institution": self._TAGS_AND_SPACING.sub("", raw["institution"]),
            "school": school,
            # NOTE(review): department carries the same extracted value as
            # school; confirm whether a separate selector is intended.
            "department": school,
            "title": self._TAGS_AND_SPACING.sub("", raw["title"]),
            "phone": self._TAGS_AND_SPACING.sub("", raw["phone"]),
            "email": self._TAGS_AND_TABS.sub("", raw["email"]),
            "teaching": self._TAGS_AND_TABS.sub("", teaching),
            "research": self._TAGS_AND_TABS.sub("", research),
        }

        print(json.dumps(faculty))
        # Returning the dict hands the record to Scrapy as a scraped item.
        return faculty
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment