Skip to content

Instantly share code, notes, and snippets.

View bh1995's full-sized avatar

Bjørn Hansen bh1995

View GitHub Profile
# SQL: per-neighbourhood count of distinct property types in Beijing,
# most diverse neighbourhood first.
# FIX: use single quotes for the SQL string literal -- double-quoted
# 'Beijing' is treated as an identifier under ANSI SQL / Spark ANSI mode,
# so the query would break there; single quotes are portable.
neighbourhood_property_sql = """SELECT neighbourhood_cleansed, count(DISTINCT property_type) as property_type_count
FROM listings_df2
WHERE city = 'Beijing'
GROUP BY neighbourhood_cleansed
ORDER BY property_type_count DESC"""
# cache(): the result is shown now and may be reused later without recompute
neighbourhood_property = spark.sql(neighbourhood_property_sql).cache()
neighbourhood_property.show()
# Plot number of neighbourhoods for the top 30 cities
neighbourhood_list = listing_total.sort("total_neighbourhood", ascending=False)
neighbourhood_list_pd = neighbourhood_list.toPandas()  # convert to pandas dataframe
# BUG FIX: iloc[1:31] silently dropped the top-ranked city (row 0).
# Take the first 30 rows, matching both the "top 30" intent and the
# limit(30) used for the listings plot.
list_neighbourhood_30 = neighbourhood_list_pd.iloc[:30]
# Evenly spaced bar positions, 4 units apart (bars are 3 wide -> 1 gap)
num = len(list_neighbourhood_30) * 4
c = list(range(0, num, 4))
plt.figure(figsize=(10, 4))
plt.bar(c, list_neighbourhood_30['total_neighbourhood'], width=3, align='center', alpha=0.5)
plt.xticks(c, list_neighbourhood_30['city'], rotation=90)
# Axis labels for consistency with the listings plot
plt.ylabel('total neighbourhood')
plt.xlabel('city')
# Bar chart: total listings for the 30 cities with the most listings
# (listing_total is already sorted by total_listing DESC)
list_small = listing_total.limit(30).toPandas()  # Convert to pandas dataframe
# Place bars every 4 units (width 3 leaves a 1-unit gap between bars)
num = 4 * len(list_small)
c = list(range(0, num, 4))
plt.figure(figsize=(10, 4))
plt.bar(c, list_small['total_listing'], width=3, align='center', alpha=0.5)
plt.xticks(c, list_small['city'], rotation=90)
plt.ylabel('total listing')
plt.xlabel('city')
# SQL: per-city listing count and distinct-neighbourhood count,
# ordered with the busiest city first
listing_sql = """SELECT city, count(id) as total_listing, count(DISTINCT neighbourhood_cleansed) as total_neighbourhood
FROM listings_df2
GROUP BY city
ORDER BY total_listing DESC"""
# cache(): this frame is reused by the plotting cells, avoid recomputing
listing_total = spark.sql(listing_sql).cache()
# Display the 30 busiest cities
listing_total.show(30)
# Clean the listings data:
#   * price1: price with the leading '$' and thousands separators removed,
#     then cast to float
#   * review_scores_rating1: rating cast to float
#   * listing_value: price per rating point, rounded to 2 decimals
listings_df2 = (
    listings_df
    .withColumn('price1', regexp_replace('price', '\\$', ''))
    .withColumn('price1', regexp_replace('price1', ',', ''))
    .withColumn('price1', col('price1').cast('float'))
    .withColumn('review_scores_rating1', col('review_scores_rating').cast('float'))
    .withColumn('listing_value', f.round(col('price1') / col('review_scores_rating1'), 2))
)
# Register as a temp view so the SQL queries can reference "listings_df2"
listings_df2.createOrReplaceTempView("listings_df2")
# Make sure all the datatypes are now as expected
listings_df2.printSchema()
# Keep only the columns the analysis needs and cache both frames,
# since each is counted and shown (two actions) right below.
listings_cols = ["id", "price", "review_scores_rating",
                 "city", "neighbourhood_cleansed", "property_type"]
listings_df = listings_df.select(*listings_cols).cache()
reviews_df = reviews_df.select("listing_id", "date", "comments").cache()
# Row counts, then a peek at the first 5 rows (same as head(5) in pandas)
print(listings_df.count())
print(reviews_df.count())
listings_df.show(5)
reviews_df.show(5)
# Input CSV locations -- update to your environment
listings_path = 'your_path/listings.csv'
reviews_path = 'your_path/reviews.csv'
# listings.csv contains quoted, multi-line text fields; rows that still
# fail to parse are dropped (DROPMALFORMED) rather than aborting the read.
listings_df = (
    spark.read
    .option('multiLine', 'True')
    .option('escape', '"')
    .option('mode', 'DROPMALFORMED')
    .csv(listings_path, header=True)
)
reviews_scheme = StructType([StructField('listing_id', IntegerType(), True),
# Perform training loop for n epochs
loss_list = []
n_epochs = 10
model.train()
for epoch in tqdm(range(n_epochs)):
loss_epoch = []
iteration=1
for images,targets in tqdm(data_loader_train):
images = list(image.to(device) for image in images)
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
# Number of detection classes; presumably background + one object class
# (cells, given the WarwickCellDataset) -- torchvision detection models
# count background as a class. TODO confirm.
num_classes = 2
# Load an instance segmentation model (Mask R-CNN, ResNet-50 FPN backbone)
# pre-trained on COCO.
# NOTE(review): `pretrained=True` is deprecated in newer torchvision in
# favour of `weights=...` -- verify against the installed version.
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
# Input feature size of the box classifier head; needed to build a
# replacement FastRCNNPredictor sized for `num_classes`.
in_features = model.roi_heads.box_predictor.cls_score.in_features
class WarwickCellDataset(object):
def __init__(self, root, transforms=None): # transforms
self.root = root
# self.transforms = transforms
self.transforms=[]
if transforms!=None:
self.transforms.append(transforms)
# load all image files, sorting them to
# ensure that they are aligned
self.imgs = list(natsorted(os.listdir(os.path.join(root, "image"))))