Skip to content

Instantly share code, notes, and snippets.

View bh1995's full-sized avatar

Bjørn Hansen bh1995

View GitHub Profile
# SQL: per-neighbourhood count of distinct property types in Beijing,
# most diverse neighbourhood first.
# FIX: use single quotes for the SQL string literal -- double-quoted
# 'Beijing' is treated as an identifier under ANSI SQL / Spark ANSI mode,
# so the query would break there; single quotes are portable.
neighbourhood_property_sql = """SELECT neighbourhood_cleansed, count(DISTINCT property_type) as property_type_count
FROM listings_df2
WHERE city = 'Beijing'
GROUP BY neighbourhood_cleansed
ORDER BY property_type_count DESC"""
# cache(): the result is shown now and may be reused later without recompute
neighbourhood_property = spark.sql(neighbourhood_property_sql).cache()
neighbourhood_property.show()
# Plot number of neighbourhoods for the top 30 cities
neighbourhood_list = listing_total.sort("total_neighbourhood", ascending=False)
neighbourhood_list_pd = neighbourhood_list.toPandas()  # convert to pandas dataframe
# BUG FIX: iloc[1:31] silently dropped the top-ranked city (row 0).
# Take the first 30 rows, matching both the "top 30" intent and the
# limit(30) used for the listings plot.
list_neighbourhood_30 = neighbourhood_list_pd.iloc[:30]
# Evenly spaced bar positions, 4 units apart (bars are 3 wide -> 1 gap)
num = len(list_neighbourhood_30) * 4
c = list(range(0, num, 4))
plt.figure(figsize=(10, 4))
plt.bar(c, list_neighbourhood_30['total_neighbourhood'], width=3, align='center', alpha=0.5)
plt.xticks(c, list_neighbourhood_30['city'], rotation=90)
# Axis labels for consistency with the listings plot
plt.ylabel('total neighbourhood')
plt.xlabel('city')
# Bar chart: total listings for the 30 cities with the most listings
# (listing_total is already sorted by total_listing DESC)
list_small = listing_total.limit(30).toPandas()  # Convert to pandas dataframe
# Place bars every 4 units (width 3 leaves a 1-unit gap between bars)
num = 4 * len(list_small)
c = list(range(0, num, 4))
plt.figure(figsize=(10, 4))
plt.bar(c, list_small['total_listing'], width=3, align='center', alpha=0.5)
plt.xticks(c, list_small['city'], rotation=90)
plt.ylabel('total listing')
plt.xlabel('city')
# SQL: per-city listing count and distinct-neighbourhood count,
# ordered with the busiest city first
listing_sql = """SELECT city, count(id) as total_listing, count(DISTINCT neighbourhood_cleansed) as total_neighbourhood
FROM listings_df2
GROUP BY city
ORDER BY total_listing DESC"""
# cache(): this frame is reused by the plotting cells, avoid recomputing
listing_total = spark.sql(listing_sql).cache()
# Display the 30 busiest cities
listing_total.show(30)
# Clean the listings data:
#   * price1: price with the leading '$' and thousands separators removed,
#     then cast to float
#   * review_scores_rating1: rating cast to float
#   * listing_value: price per rating point, rounded to 2 decimals
listings_df2 = (
    listings_df
    .withColumn('price1', regexp_replace('price', '\\$', ''))
    .withColumn('price1', regexp_replace('price1', ',', ''))
    .withColumn('price1', col('price1').cast('float'))
    .withColumn('review_scores_rating1', col('review_scores_rating').cast('float'))
    .withColumn('listing_value', f.round(col('price1') / col('review_scores_rating1'), 2))
)
# Register as a temp view so the SQL queries can reference "listings_df2"
listings_df2.createOrReplaceTempView("listings_df2")
# Make sure all the datatypes are now as expected
listings_df2.printSchema()
# Keep only the columns the analysis needs and cache both frames,
# since each is counted and shown (two actions) right below.
listings_cols = ["id", "price", "review_scores_rating",
                 "city", "neighbourhood_cleansed", "property_type"]
listings_df = listings_df.select(*listings_cols).cache()
reviews_df = reviews_df.select("listing_id", "date", "comments").cache()
# Row counts, then a peek at the first 5 rows (same as head(5) in pandas)
print(listings_df.count())
print(reviews_df.count())
listings_df.show(5)
reviews_df.show(5)
# Input CSV locations -- update to your environment
listings_path = 'your_path/listings.csv'
reviews_path = 'your_path/reviews.csv'
# listings.csv contains quoted, multi-line text fields; rows that still
# fail to parse are dropped (DROPMALFORMED) rather than aborting the read.
listings_df = (
    spark.read
    .option('multiLine', 'True')
    .option('escape', '"')
    .option('mode', 'DROPMALFORMED')
    .csv(listings_path, header=True)
)
reviews_scheme = StructType([StructField('listing_id', IntegerType(), True),
# Perform training loop for n epochs
loss_list = []
n_epochs = 10
model.train()
for epoch in tqdm(range(n_epochs)):
loss_epoch = []
iteration=1
for images,targets in tqdm(data_loader_train):
images = list(image.to(device) for image in images)
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
# Number of detection classes; presumably background + one object class
# (cells, given the WarwickCellDataset) -- torchvision detection models
# count background as a class. TODO confirm.
num_classes = 2
# Load an instance segmentation model (Mask R-CNN, ResNet-50 FPN backbone)
# pre-trained on COCO.
# NOTE(review): `pretrained=True` is deprecated in newer torchvision in
# favour of `weights=...` -- verify against the installed version.
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
# Input feature size of the box classifier head; needed to build a
# replacement FastRCNNPredictor sized for `num_classes`.
in_features = model.roi_heads.box_predictor.cls_score.in_features
class WarwickCellDataset(object):
def __init__(self, root, transforms=None): # transforms
self.root = root
# self.transforms = transforms
self.transforms=[]
if transforms!=None:
self.transforms.append(transforms)
# load all image files, sorting them to
# ensure that they are aligned
self.imgs = list(natsorted(os.listdir(os.path.join(root, "image"))))