Skip to content

Instantly share code, notes, and snippets.

View khunreus's full-sized avatar

Anastasia Reusova khunreus

View GitHub Profile
@khunreus
khunreus / geocoded_Tweets.R
Created February 9, 2018 01:57 — forked from dsparks/geocoded_Tweets.R
Gathering Tweets, geocoding users, and plotting them
doInstall <- TRUE
toInstall <- c("twitteR", "dismo", "maps", "ggplot2")
if(doInstall){install.packages(toInstall, repos = "http://cran.us.r-project.org")}
lapply(toInstall, library, character.only = TRUE)
searchTerm <- "#rstats"
searchResults <- searchTwitter(searchTerm, n = 1000) # Gather Tweets
tweetFrame <- twListToDF(searchResults) # Convert to a nice dF
userInfo <- lookupUsers(tweetFrame$screenName) # Batch lookup of user info
@khunreus
khunreus / install_modules_start_scrapy_project.txt
Last active February 9, 2019 14:10
airbnb scraping sample
#Command line
pip install scrapy
pip install selenium
#cd your/desired/project/path/
scrapy startproject airbnb #will create a project "airbnb" in the folder you are in
cd airbnb/ #need to cd to a folder with a scrapy.cfg file
#this is the test address to kick off the project
#homes.py created in airbnb/airbnb/spiders/
@khunreus
khunreus / test_the_webdriver.py
Last active February 9, 2019 18:35
airbnb scraping sample 2
# ipython
import scrapy
import selenium
from scrapy.selector import Selector
from selenium import webdriver
driver = webdriver.Chrome('path/to/the/chromedriver')
driver.get('https://www.airbnb.ae/s/Dubai/homes?refinement_paths%5B%5D=%2Fhomes&adults=0&children=0&infants=0&guests=0&place_id=ChIJRcbZaklDXz4RYlEphFBu5r0&query=Dubai&allow_override%5B%5D=&s_tag=CKxLe9y7')
scrapy_selector = Selector(text = driver.page_source)
"""
Python 3.6
Part of the Scrapy architecture.
Goes through the "Homes" page of airbnb.ae and collects, for each property, its name, price per night, and rating.
"""
from time import sleep
import scrapy, selenium
from selenium import webdriver
from scrapy import Spider
"""
Python 3.6
Downloads images from URLs stored in a CSV file and saves them,
according to their labels, to a specified directory / subdirectory,
e.g. to sort images into per-label folders for image classification.
"""
import pandas as pd
import urllib
import os
# Command line
scrapy shell 'https://www.airbnb.ae/s/Dubai/homes?refinement_paths%5B%5D=%2Fhomes&adults=0&children=0&infants=0&guests=0&place_id=ChIJRcbZaklDXz4RYlEphFBu5r0&query=Dubai&allow_override%5B%5D=&s_tag=CKxLe9y7'
# testing selector for the property name
response.xpath('//*[@class="_ng4pvpo"]')
# Out[1]: []
response.xpath('//*[@itemtype="http://schema.org/ListItem"]')
# Out[2]: []
response.xpath('//*[@itemprop="name"]')
# Out[3]: []
"""
python3.6
ipython console test
"""
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
self.header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'}
"""
python3.6
ipython console test
"""
from scrapy.selector import Selector
scrapy_selector = Selector(text = self.driver.page_source)
homes_selector = scrapy_selector.xpath('//*[@itemtype="http://schema.org/ListItem"]') #name of an item can be changed by Airbnb
try:
"""
python3.6
"""
from time import sleep
import scrapy
import selenium
from scrapy import Spider
from scrapy.selector import Selector
"""
python3.6
Scrapy + Selenium
"""
for profile_url in profile_urls_distinct:
self.logger.info('Home #' + str(q))
self.driver.get(profile_url)
q = q+1
sleep(10)
link_to_home = profile_url