Anastasia Reusova khunreus

## geocoded_Tweets.R
# Optionally install, then attach, the packages listed in toInstall.
# Set doInstall to FALSE to skip the install.packages() call.
doInstall <- TRUE
toInstall <- c("twitteR", "dismo", "maps", "ggplot2")
if(doInstall){install.packages(toInstall, repos = "http://cran.us.r-project.org")}
# Call library() once per package name; character.only = TRUE makes
# library() treat each element as a package-name string.
lapply(toInstall, library, character.only = TRUE)

# Search for the hashtag in searchTerm, passing n = 1000 to searchTwitter().
searchTerm <- "#rstats"
searchResults <- searchTwitter(searchTerm, n = 1000)  # Gather Tweets
tweetFrame <- twListToDF(searchResults)  # Convert to a nice dF

# Look up the accounts named in the screenName column of tweetFrame.
userInfo <- lookupUsers(tweetFrame$screenName)  # Batch lookup of user info

## install_modules_start_scrapy_project.txt
# Command line: install the two Python packages used by the scraper
pip install scrapy
pip install selenium

# Optionally change into the directory that should hold the project first:
#cd your/desired/project/path/
scrapy startproject airbnb # creates a project named "airbnb" in the current directory
cd airbnb/ # move into the folder that contains the scrapy.cfg file

#this is the test address to kick off the project
#homes.py created in airbnb/airbnb/spiders/

## test_the_webdriver.py
# ipython
import scrapy
import selenium
from scrapy.selector import Selector
from selenium import webdriver

# Start Chrome via the chromedriver binary at the given path, then load the
# Airbnb "homes" search page for Dubai.
driver = webdriver.Chrome('path/to/the/chromedriver')
search_url = 'https://www.airbnb.ae/s/Dubai/homes?refinement_paths%5B%5D=%2Fhomes&adults=0&children=0&infants=0&guests=0&place_id=ChIJRcbZaklDXz4RYlEphFBu5r0&query=Dubai&allow_override%5B%5D=&s_tag=CKxLe9y7'
driver.get(search_url)

# Wrap the page source held by the driver in a Scrapy selector so it can be
# queried with XPath.
scrapy_selector = Selector(text=driver.page_source)

## airbnb_product_catalog_scraper.py
"""
python3.6
part of the Scrapy architecture
goes through the "Homes" page of airbnb.ae and gets property name, price per night, rating per property
"""

from time import sleep
import scrapy, selenium
from selenium import webdriver
from scrapy import Spider

## download_images_from_a_csv.py
"""
python3.6
downloads images from URLs stored in a csv file according to labels
and saves them to a specified directory / subdirectory
e.g. for sorting images into folders by labels for image classification
"""

import pandas as pd
import urllib
import os

## airbnb_scrapy_shell_test.txt
# Command line
scrapy shell 'https://www.airbnb.ae/s/Dubai/homes?refinement_paths%5B%5D=%2Fhomes&adults=0&children=0&infants=0&guests=0&place_id=ChIJRcbZaklDXz4RYlEphFBu5r0&query=Dubai&allow_override%5B%5D=&s_tag=CKxLe9y7'
# Testing candidate selectors for the property name (every attempt below returns an empty list)
response.xpath('//*[@class="_ng4pvpo"]')
# Out[1]: []
response.xpath('//*[@itemtype="http://schema.org/ListItem"]')
# Out[2]: []
response.xpath('//*[@itemprop="name"]')
# Out[3]: []

## airbnb_search_navigation.py
"""
python3.6
ipython console test
"""
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Store a Chrome-on-Linux User-Agent string in a headers dict on the instance.
# NOTE(review): nothing visible in this snippet defines `self` — presumably the
# line was lifted from a method; confirm before running it at top level.
self.header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'}

## airbnb_catalogue_details_iterative.py
"""
python3.6
ipython console test
"""

from scrapy.selector import Selector

# Build a Scrapy selector over the page source exposed by self.driver
# (presumably the Selenium webdriver — confirm against the enclosing class).
scrapy_selector = Selector(text = self.driver.page_source)
# Select every element whose itemtype attribute is the schema.org ListItem URL.
homes_selector = scrapy_selector.xpath('//*[@itemtype="http://schema.org/ListItem"]') # NOTE: Airbnb may change this markup, so the selector can stop matching
try:

## airbnb_catalogue_crawler_with_pdp.py
"""
python3.6
"""
from time import sleep

import scrapy
import selenium

from scrapy import Spider
from scrapy.selector import Selector

## airbnb_get_pdp - before reviews.py
"""
python3.6
Scrapy + Selenium
"""
for profile_url in profile_urls_distinct:
    # Log the running counter q before loading the page, then advance it.
    self.logger.info(f'Home #{q}')
    self.driver.get(profile_url)
    q += 1
    # Pause for 10 seconds after requesting the page.
    sleep(10)
    # Keep the visited URL under the name used for the home's link.
    link_to_home = profile_url
	doInstall <- TRUE
	toInstall <- c("twitteR", "dismo", "maps", "ggplot2")
	if(doInstall){install.packages(toInstall, repos = "http://cran.us.r-project.org")}
	lapply(toInstall, library, character.only = TRUE)

	searchTerm <- "#rstats"
	searchResults <- searchTwitter(searchTerm, n = 1000) # Gather Tweets
	tweetFrame <- twListToDF(searchResults) # Convert to a nice dF

	userInfo <- lookupUsers(tweetFrame$screenName) # Batch lookup of user info
	#Command line
	pip install scrapy
	pip install selenium

	#cd your/desired/project/path/
	scrapy startproject airbnb #will create a project "airbnb" in the folder you are in
	cd airbnb/ #need to cd to a folder with a scrapy.cfg file

	#this is the test address to kick off the project
	#homes.py created in airbnb/airbnb/spiders/
	# ipython
	import scrapy
	import selenium
	from scrapy.selector import Selector
	from selenium import webdriver

	driver = webdriver.Chrome('path/to/the/chromedriver')
	driver.get('https://www.airbnb.ae/s/Dubai/homes?refinement_paths%5B%5D=%2Fhomes&adults=0&children=0&infants=0&guests=0&place_id=ChIJRcbZaklDXz4RYlEphFBu5r0&query=Dubai&allow_override%5B%5D=&s_tag=CKxLe9y7')

	scrapy_selector = Selector(text = driver.page_source)
	"""
	python3.6
	part of the Scrapy architecture
	goes through the "Homes" page of airbnb.ae and gets property name, price per night, rating per property
	"""

	from time import sleep
	import scrapy, selenium
	from selenium import webdriver
	from scrapy import Spider
	"""
	python3.6
	downloads images from URLs stored in a csv file according to labels
	and saves them to a specified directory / subdirectory
	e.g. for sorting images into folders by labels for image classification
	"""

	import pandas as pd
	import urllib
	import os
	# Command line
	scrapy shell 'https://www.airbnb.ae/s/Dubai/homes?refinement_paths%5B%5D=%2Fhomes&adults=0&children=0&infants=0&guests=0&place_id=ChIJRcbZaklDXz4RYlEphFBu5r0&query=Dubai&allow_override%5B%5D=&s_tag=CKxLe9y7'
	# testing selector for the property name
	response.xpath('//*[@class="_ng4pvpo"]')
	# Out[1]: []
	response.xpath('//*[@itemtype="http://schema.org/ListItem"]')
	# Out[2]: []
	response.xpath('//*[@itemprop="name"]')
	# Out[3]: []
	"""
	python3.6
	ipython console test
	"""
	import selenium
	from selenium import webdriver
	from selenium.webdriver.common.keys import Keys

	self.header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'}
	"""
	python3.6
	Scrapy + Selenium
	"""
	# Visit each distinct profile URL in turn.
	# The loop body is indented one level below the `for` header; at the same
	# level as the header it is an IndentationError.
	for profile_url in profile_urls_distinct:
		# Log the running counter q for the home about to be loaded.
		self.logger.info('Home #' + str(q))
		self.driver.get(profile_url)
		q = q+1
		# Pause for 10 seconds after requesting the page.
		sleep(10)
		# Keep the visited URL under the name used for the home's link.
		link_to_home = profile_url