This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Count the distinct property types offered in each Beijing neighbourhood,
# ordered so the neighbourhoods with the most variety come first.
# The query reads from the "listings_df2" temp view.
neighbourhood_property_sql = """SELECT neighbourhood_cleansed, count(DISTINCT property_type) as property_type_count
FROM listings_df2
WHERE city = "Beijing"
GROUP BY neighbourhood_cleansed
ORDER BY property_type_count DESC"""
# Mark the result for caching before displaying it.
neighbourhood_property = spark.sql(neighbourhood_property_sql).cache()
neighbourhood_property.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Plot nr neighbourhoods for top 30 cities
# Order the per-city totals by neighbourhood count, largest first.
neighbourhood_list = listing_total.sort("total_neighbourhood", ascending=False)
neighbourhood_list_pd = neighbourhood_list.toPandas()  # convert to pandas dataframe
# NOTE(review): the slice starts at row 1, so the city with the highest
# neighbourhood count is left out of the chart -- confirm this is intentional.
list_neighbourhood_30 = neighbourhood_list_pd.iloc[1:31]
# Bar centres are placed 4 units apart; with width=3 that leaves a gap of 1
# between neighbouring bars.
num = len(list_neighbourhood_30) * 4
c = list(range(0, num, 4))
plt.figure(figsize=(10, 4))
plt.bar(c, list_neighbourhood_30['total_neighbourhood'], width=3, align='center', alpha=0.5)
# Label each bar with its city name, rotated so the names do not overlap.
plt.xticks(c, list_neighbourhood_30['city'], rotation=90)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Plot nr listings for top 30 cities
# Take the first 30 rows of listing_total and bring them to the driver.
list_small = listing_total.limit(30).toPandas()  # Convert to pandas dataframe
# Bar centres are placed 4 units apart; with width=3 that leaves a gap of 1
# between neighbouring bars.
num = len(list_small) * 4
c = list(range(0, num, 4))
plt.figure(figsize=(10, 4))
plt.bar(c, list_small['total_listing'], width=3, align='center', alpha=0.5)
# Label each bar with its city name, rotated so the names do not overlap.
plt.xticks(c, list_small['city'], rotation=90)
plt.ylabel('total listing')
plt.xlabel('city')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SQL query for getting listings per city and ordering them
# For every city: the number of listings and the number of distinct
# neighbourhoods, sorted by listing count (largest first).
listing_sql = """SELECT city, count(id) as total_listing, count(DISTINCT neighbourhood_cleansed) as total_neighbourhood
FROM listings_df2
GROUP BY city
ORDER BY total_listing DESC"""
# Mark the aggregated result for caching.
listing_total = spark.sql(listing_sql).cache()
# Show first 30
listing_total.show(30)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# clean listings data
# Strip "$" and "," from the price text in a single pass, then cast the
# remaining digits to float. The result is stored in a new column, "price1".
listings_df2 = listings_df.withColumn('price1', regexp_replace('price', '[$,]', ''))
listings_df2 = listings_df2.withColumn('price1', col('price1').cast('float'))
# Numeric copy of the review score.
listings_df2 = listings_df2.withColumn('review_scores_rating1', col('review_scores_rating').cast('float'))
# listing_value = price divided by review score, rounded to 2 decimals.
listings_df2 = listings_df2.withColumn('listing_value', f.round(col('price1') / col('review_scores_rating1'), 2))
# Register the cleaned frame so it can be queried with spark.sql.
listings_df2.createOrReplaceTempView("listings_df2")
# Make sure all the datatypes are now as expected
listings_df2.printSchema()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Keep only the columns that are selected below and mark both frames for
# caching.
listings_df = listings_df.select(
    "id", "price", "review_scores_rating",
    "city", "neighbourhood_cleansed", "property_type",
).cache()
reviews_df = reviews_df.select("listing_id", "date", "comments").cache()
# Print the number of rows in each frame
print(listings_df.count())
print(reviews_df.count())
# Show the first 5 rows (same as head(5) in pandas)
listings_df.show(5)
reviews_df.show(5)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Placeholder locations of the input CSV files -- replace "your_path".
listings_path = 'your_path/listings.csv'
reviews_path = 'your_path/reviews.csv'
# Read the listings CSV using its header row for column names.
# The method chain is parenthesised so no backslash continuations are needed.
listings_df = (
    spark.read
    .option('multiLine', 'True')      # allow quoted fields to span several lines
    .option('escape', '"')            # a double quote escapes a quote inside a field
    .option("mode", "DROPMALFORMED")  # drop rows that cannot be parsed
    .csv(listings_path, header=True)
)
reviews_scheme = StructType([StructField('listing_id', IntegerType(), True), |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Perform training loop for n epochs | |
loss_list = [] | |
n_epochs = 10 | |
model.train() | |
for epoch in tqdm(range(n_epochs)): | |
loss_epoch = [] | |
iteration=1 | |
for images,targets in tqdm(data_loader_train): | |
images = list(image.to(device) for image in images) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

# NOTE(review): presumably background plus one object class -- confirm.
num_classes = 2
# load an instance segmentation model pre-trained on COCO
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
# get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class WarwickCellDataset(object): | |
def __init__(self, root, transforms=None): # transforms | |
self.root = root | |
# self.transforms = transforms | |
self.transforms=[] | |
if transforms!=None: | |
self.transforms.append(transforms) | |
# load all image files, sorting them to | |
# ensure that they are aligned | |
self.imgs = list(natsorted(os.listdir(os.path.join(root, "image")))) |