Skip to content

Instantly share code, notes, and snippets.

Avatar

Bjørn Hansen bh1995

View GitHub Profile
View word_score
import re
# Upper bound on the number of reviews that are processed.
N_total = 32_261_216
# Minimum occurrence count for a word: 0.5 percent of the number of reviews.
N_limit = 0.005 * N_total
# NOTE(review): presumably the review-score cutoff used further down -- confirm against its usage.
score_threshold = 90
comments_cleaned = reviews_with_score_df\
.select(['comments', 'mean_score'])\
.limit(N_total)\
View filtering words
# Build the (listing id, review score) table used to filter reviews.
# Both columns are cast to IntegerType, and rows whose value is null after the
# cast are dropped.
#
# Fix: the original finished by re-assigning the raw, unfiltered
# `listings_df.select(...)` to listings_score_only_df, which silently discarded
# every cast/filter step. That trailing re-assignment is removed, and the
# cleaned frame (rather than the raw selection) is the one that gets cached.
listings_score_only_df = (
    listings_df.select(['id', 'review_scores_rating'])
    .withColumn('id', col('id').cast(IntegerType()))
    .filter(col('id').isNotNull())
    .withColumn('review_scores_rating', col('review_scores_rating').cast(IntegerType()))
    .filter(col('review_scores_rating').isNotNull())
    .cache()
)
View spark_session
# Set up Spark: configuration first, then the context, then the session.
conf = SparkConf()
conf.set("spark.ui.port", "4050")  # port for the Spark UI
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()
View seasonal_trend
# Count the rows of beijing_popularity per month of their "date" column,
# ordered by month, and draw the counts as a bar chart.
month_col = f.month(beijing_popularity["date"]).alias("month")
beijing_popularity_month = (
    beijing_popularity
    .groupBy(month_col)
    .count()
    .sort("month", ascending=True)
)
beijing_popularity_month_pd = beijing_popularity_month.toPandas()

# Bars are placed every 4 x-units and drawn 3 wide, leaving a gap between them.
num = len(beijing_popularity_month_pd) * 4
c = list(range(0, num, 4))

plt.figure(figsize=(10, 4))
plt.bar(
    c,
    beijing_popularity_month_pd['count'],
    width=3,
    align='center',
    alpha=0.5,
)
plt.xticks(c, beijing_popularity_month_pd["month"])  # label each bar with its month
plt.ylabel('Popularity')
plt.xlabel('Month')
plt.title('The Popularity of Beijing by month')
View beijing_popularity
# Review dates joined to their listing's city / neighbourhood, restricted to
# Beijing and ordered by date.
# NOTE(review): the WHERE clause filters on the right-hand table (b.city), so
# unmatched reviews from the LEFT JOIN are dropped and the query behaves like
# an inner join -- confirm that is intended.
beijing_popularity_sql = """SELECT a.date, b.city,b.neighbourhood_cleansed
FROM reviews_df a
LEFT JOIN listings_df2 b
on a.listing_id = b.id
WHERE b.city='Beijing'
ORDER BY date ASC;
"""
beijing_popularity = spark.sql(beijing_popularity_sql)
beijing_popularity.cache()  # cache() marks the frame and returns the same object
beijing_popularity_pd = beijing_popularity.toPandas()  # pandas copy of the result
# plot
View beijing_listing_price_filter
# Histogram of the Beijing listing prices that are strictly below `lowPrice`.
lowPrice = 2000  # exclusive upper price bound for the rows kept

# Fix: the filtered frame used to be stored in `beijing_price_pd` while the
# histogram below read `beijing_price_low_pd`. That name was never assigned
# here, and the assignment also overwrote the unfiltered `beijing_price_pd`.
# The filtered data now lives under the name the plot actually uses.
beijing_price_low_pd = beijing_price.filter(beijing_price["price"] < lowPrice).toPandas()

# plot
plt.figure(figsize=[16, 7])  # set figure size
plt.hist(
    beijing_price_low_pd['price'],
    bins=50,
    alpha=0.5,
    color='red',
    edgecolor='white',
    linewidth=1.2,
)
plt.xlabel('price', size=15)
plt.ylabel('count', size=15)
# The title follows `lowPrice`, so it cannot drift from the filter above.
plt.title(
    'Histogram of Bei Jing price distribution of below {}$ price'.format(lowPrice),
    size=15,
)
plt.show()
View beijing_listing_price
# Prices (column price1, exposed as "price") of all listings whose city is Beijing.
beijing_price_sql = """SELECT price1 as price
FROM listings_df2
WHERE city = 'Beijing'
"""
beijing_price = spark.sql(beijing_price_sql)
beijing_price.cache()  # cache() marks the frame and returns the same object
beijing_price_pd = beijing_price.toPandas()

# plot
hist_style = dict(bins=50, alpha=0.5, color='red', edgecolor='white', linewidth=1.2)
plt.figure(figsize=[16, 7])  # set figure size
plt.hist(beijing_price_pd['price'], **hist_style)
plt.xlabel('price', size=15)
View beijing_neighbourhood
# Number of distinct property types per Beijing neighbourhood, largest first.
neighbourhood_property_sql = """SELECT neighbourhood_cleansed, count(DISTINCT property_type) as property_type_count
FROM listings_df2
WHERE city = "Beijing"
GROUP BY neighbourhood_cleansed
ORDER BY property_type_count DESC"""
neighbourhood_property = spark.sql(neighbourhood_property_sql)
neighbourhood_property.cache()  # cache() marks the frame and returns the same object
neighbourhood_property.show()
View order_neighbourhood
# Bar chart of the neighbourhood count per city, for 30 cities taken from the
# descending ranking by total_neighbourhood.
neighbourhood_list = listing_total.sort("total_neighbourhood", ascending=False)
neighbourhood_list_pd = neighbourhood_list.toPandas()  # convert to pandas dataframe

# NOTE(review): the slice starts at row 1, so the top-ranked row (row 0) is
# left out and ranks 2-31 are plotted -- confirm that skipping it is intended.
list_neighbourhood_30 = neighbourhood_list_pd.iloc[1:31]

# Bars are placed every 4 x-units and drawn 3 wide, leaving a gap between them.
num = len(list_neighbourhood_30) * 4
c = list(range(0, num, 4))

plt.figure(figsize=(10, 4))
plt.bar(
    c,
    list_neighbourhood_30['total_neighbourhood'],
    width=3,
    align='center',
    alpha=0.5,
)
plt.xticks(c, list_neighbourhood_30['city'], rotation=90)  # vertical city labels
View plot_listings_pandas
# Bar chart of the listing count per city for the first 30 rows of listing_total.
# NOTE(review): limit(30) only yields the "top 30" if listing_total is already
# sorted by total_listing -- confirm where it is built.
list_small = listing_total.limit(30).toPandas()  # convert to pandas dataframe

# Bars are placed every 4 x-units and drawn 3 wide, leaving a gap between them.
num = len(list_small) * 4
c = list(range(0, num, 4))

plt.figure(figsize=(10, 4))
plt.bar(
    c,
    list_small['total_listing'],
    width=3,
    align='center',
    alpha=0.5,
)
plt.xticks(c, list_small['city'], rotation=90)  # vertical city labels
plt.ylabel('total listing')
plt.xlabel('city')