Skip to content

Instantly share code, notes, and snippets.

def parse_main_page(self, response):
    """Schedule one detail-page request for every listing row on a results page.

    Reads the ``data-pid`` attribute of each element whose class is exactly
    ``row`` and builds the posting URL from it.

    Args:
        response: The downloaded results page; only its ``xpath`` method is
            used here.

    Yields:
        Request: One request per posting id, with ``self.parse_detail_page``
        as its callback.
    """
    detail_url = 'https://newyork.craigslist.org/stn/cto/{}.html'
    # Named ``pid`` rather than ``id`` so the builtin ``id`` is not shadowed.
    for pid in response.xpath('//*[@class="row"]/@data-pid').extract():
        yield Request(detail_url.format(pid), callback=self.parse_detail_page)
def parse_detail_page(self, response):
    """Pull the price, title and posting time out of a posting's detail page.

    Each field is the first text node matched by its XPath, or ``None`` when
    nothing matches, so a posting that lacks one of the fields does not abort
    the callback with an ``IndexError``.

    Args:
        response: The downloaded detail page; only its ``xpath`` method is
            used here.
    """
    price = response.xpath('//*[@class = "price"]/text()').extract_first()
    title = response.xpath('//*[@ id = "titletextonly"]/text()').extract_first()
    # Positional path into the page layout, kept exactly as originally written.
    post_time = response.xpath(
        '//*[@id = "pagecontainer"]/section/section/div[2]/p[2]/time/text()'
    ).extract_first()
    # NOTE(review): the three values are neither used nor returned in the code
    # visible here - presumably an item-building step follows; confirm.
from scrapy import Spider, Request
from scrapy.selector import Selector
from demo.items import DemoItem
# Spider subclass; the three assignments below are its class-level settings.
class DemoSpider(Spider):
name = 'demo'
# NOTE(review): Scrapy documents `allowed_domains` (bare domain names, no scheme);
# `allowed_urls` is presumably not read by the framework - confirm and rename.
allowed_urls = ['https://newyork.craigslist.org']
start_urls = ['https://newyork.craigslist.org/search/stn/cto']
# NOTE(review): the body of parse() is not visible in this excerpt.
def parse(self, response):
# ---------------------------------------------------------------------------
# Collapse the combined data frame (more than 22 million rows, per the
# original note) to one row per Year / state postal code / borrower gender,
# counting the rows that fall in each group.
# ---------------------------------------------------------------------------
home_all_year_borr_male_female_only_count <-
  home_all_year_borr_male_female_only %>%
  group_by(Year, US.Postal.code, Borrower.Gender) %>%
  summarise(Borrower.Gender.count = n())

dim(home_all_year_borr_male_female_only_count)
# Recorded output: 540 4 -- matches 5 years * 54 state codes * 2 genders.
# ---------------------------------------------------------------------------
# Stack the per-year data frames (2014 back to 2010) into a single data frame.
# ---------------------------------------------------------------------------
home_all_year_male_female <- rbind(
  home_2014_male_female,
  home_2013_male_female,
  home_2012_male_female,
  home_2011_male_female,
  home_2010_male_female
)

dim(home_all_year_male_female)
# Recorded row count: 22298905
# ---------------------------------------------------------------------------
# Checking missing data
# ---------------------------------------------------------------------------
# Count the rows whose V3 value (the state code, per the original note) is 0.
sum(home_FNM_2014$V3 == 0)
# Recorded result: the 2014 Fannie Mae file has 171 rows with state code 00.

# Collect the indices of those rows so they can be deleted.
fnm_rows_0 <- which(home_FNM_2014$V3 == 0)
# ---------------------------------------------------------------------------
# Reading the tables
# ---------------------------------------------------------------------------

# 2014 Fannie Mae file (Enterprise Code 1); whitespace-separated, no header row.
home_FNM_2014 <- read.table(
  file = "~/Documents/shiny_project/home_all_year/2014_SFCensusTractFNM2014/fnma_sf2014c_loans.txt",
  header = FALSE,
  sep = ""
)
dim(home_FNM_2014)
# Recorded output: 1899729 39

# 2014 Freddie Mac file (Enterprise Code 2); same layout options as above.
home_FRE_2014 <- read.table(
  file = "~/Documents/shiny_project/home_all_year/2014_SFCensusTractFRE2014/fhlmc_sf2014c_loans.txt",
  header = FALSE,
  sep = ""
)
dim(home_FRE_2014)
## Monthly default rate from April 2005 to September 2005.
## For each month, the rate is the number of rows whose PAY_* value for that
## month exceeds 2, divided by total_rows and formatted with percent().
## Columns are listed oldest month first: PAY_6 pairs with April 2005 and
## PAY_0 with September 2005 (the original sequence has no PAY_1).
pay_columns <- c("PAY_6", "PAY_5", "PAY_4", "PAY_3", "PAY_2", "PAY_0")
month_labels <- c("April 2005", "May 2005", "June 2005",
                  "July 2005", "August 2005", "September 2005")

default_rate <- vector(length = length(pay_columns))
names(default_rate) <- month_labels
for (i in seq_along(pay_columns)) {
  # na.rm = TRUE mirrors filter(), which drops rows where the test is NA.
  late_rows <- sum(credi_tbl_temp[[pay_columns[i]]] > 2, na.rm = TRUE)
  default_rate[i] <- percent(late_rows / total_rows)
}
# Recode the numeric marital-status values as a labelled factor.
married_vec <- c(1, 2, 3)
married_desc <- c("married", "single", "others")
# NOTE(review): the source column comes from credit_tbl while the result is
# stored in credi_tbl_temp - confirm the two tables are meant to differ.
credi_tbl_temp$MARRIAGE <- factor(
  x = credit_tbl$MARRIAGE,
  levels = married_vec,
  labels = married_desc
)

# Marital-status bar chart; position = "fill" stacks each bar to full height.
m <- ggplot(data = credi_tbl_temp, aes(x = default.payment.next.month)) +
  geom_bar(aes(fill = MARRIAGE), position = "fill") +
  ggtitle("Marital Status of Default Vs. Non Default") +
  xlab("")
m
# Violin plot of AGE for each default.payment.next.month value, filled by
# that same variable.
p <- ggplot(
  data = credi_tbl_temp,
  aes(x = default.payment.next.month, y = AGE)
) +
  geom_violin(aes(fill = default.payment.next.month)) +
  ggtitle("Age Profile of Default Payment Vs.Non Default") +
  xlab("")
p
# Density plot: adds one density curve per default.payment.next.month value
# to the base plot `g`, which is defined outside this snippet.
g4 <- g +
  geom_density(aes(color = default.payment.next.month)) +
  ggtitle("Density Vs. Age Profile") +
  ylab("Density")  # axis label previously misspelled as "Denisty"
g4
# Creating contingency table for sex
# Alternative dplyr form kept from the original notes:
#filter(credi_tbl_temp,default.payment.next.month==1) %>% group_by(SEX) %>% summarise(count(SEX))
# table() needs arguments of equal length, so cross two columns rather than
# passing the whole data frame alongside one column.
# NOTE(review): default.payment.next.month is chosen as the other dimension
# based on the commented-out line above - confirm this is the intended table.
table(credi_tbl_temp$default.payment.next.month, credi_tbl_temp$SEX)