View word_score
import re | |
# Number of reviews to use
N_total = 32261216 | |
# Word occurrence lower threshold: 0.5 percent of the number of reviews
N_limit = 0.005 * N_total | |
# NOTE(review): presumably the cut-off applied to mean_score; its use is not
# visible in this excerpt - confirm against the rest of the cell.
score_threshold = 90 | |
comments_cleaned = reviews_with_score_df\ | |
.select(['comments', 'mean_score'])\ | |
.limit(N_total)\ |
View filtering words
# Build an (id, review_scores_rating) table from the listings, keeping only
# rows where both columns are non-null after being cast to integers.
# (With Spark's default, non-ANSI cast semantics an unparsable value becomes
# null, so the isNotNull() filters drop it.)
#
# The original cell ended by re-running the plain select() on listings_df,
# which discarded the casts and filters again; that statement is removed.
# cache() is applied to the cleaned frame, since that is the one reused later.
listings_score_only_df = (
    listings_df
    .select(['id', 'review_scores_rating'])
    .withColumn('id', col('id').cast(IntegerType()))
    .filter(col('id').isNotNull())
    .withColumn('review_scores_rating',
                col('review_scores_rating').cast(IntegerType()))
    .filter(col('review_scores_rating').isNotNull())
    .cache()
)
View spark_session
# Create the Spark context and session (Spark UI served on port 4050).
conf = SparkConf().set("spark.ui.port", "4050")
# getOrCreate() reuses a context that is already running, so re-executing this
# cell does not fail with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()
View seasonal_trend
# Count the rows of beijing_popularity per calendar month (taken from the
# "date" column), ordered by month, and draw the counts as a bar chart.
beijing_popularity_month = (
    beijing_popularity
    .groupBy(f.month(beijing_popularity["date"]).alias("month"))
    .count()
    .sort("month", ascending=True)
)
beijing_popularity_month_pd = beijing_popularity_month.toPandas()

# One bar every 4 x-units; with width=3 this leaves a gap of 1 between bars.
num = len(beijing_popularity_month_pd) * 4
c = list(range(0, num, 4))

plt.figure(figsize=(10, 4))
plt.bar(
    c,
    beijing_popularity_month_pd['count'],
    width=3,
    align='center',
    alpha=0.5,
)
plt.xticks(c, beijing_popularity_month_pd["month"])
plt.xlabel('Month')
plt.ylabel('Popularity')
plt.title('The Popularity of Beijing by month')
View beijing_popularity
# One row per review whose listing is in Beijing, ordered by review date.
# - Written as an inner JOIN: the WHERE clause on b.city already discarded the
#   unmatched rows a LEFT JOIN would have produced, so the result is the same.
# - No trailing ';' in the statement text: not every Spark SQL parser version
#   accepts one in spark.sql().
# - ORDER BY is qualified with the table alias to keep the column unambiguous.
beijing_popularity_sql = """
SELECT a.date, b.city, b.neighbourhood_cleansed
FROM reviews_df a
JOIN listings_df2 b
  ON a.listing_id = b.id
WHERE b.city = 'Beijing'
ORDER BY a.date ASC
"""
# Cached because the frame is both collected to pandas here and reused by the
# per-month aggregation cell.
beijing_popularity = spark.sql(beijing_popularity_sql).cache()
beijing_popularity_pd = beijing_popularity.toPandas()
# plot |
View beijing_listing_price_filter
# Histogram of Beijing listing prices restricted to prices below lowPrice.
lowPrice = 2000
# Store the filtered prices under the name the plot reads. The original
# assigned them to beijing_price_pd, which left beijing_price_low_pd undefined
# (or stale) and overwrote the unfiltered pandas frame.
beijing_price_low_pd = beijing_price.filter(beijing_price["price"] < lowPrice).toPandas()

# plot
plt.figure(figsize=[16, 7])  # set figure size
plt.hist(beijing_price_low_pd['price'], bins=50, alpha=0.5, color='red',
         edgecolor='white', linewidth=1.2)
plt.xlabel('price', size=15)
plt.ylabel('count', size=15)
plt.title('Histogram of Bei Jing price distribution of below 2000$ price', size=15)
plt.show()
View beijing_listing_price
# Select the price1 column (exposed as "price") of every listing whose city is
# 'Beijing', then plot its distribution as a 50-bin histogram.
beijing_price_sql = """SELECT price1 as price | |
FROM listings_df2 | |
WHERE city = 'Beijing' | |
""" | |
# Cache the query result; it is collected to pandas on the next line.
beijing_price = spark.sql(beijing_price_sql).cache() | |
beijing_price_pd = beijing_price.toPandas() | |
# plot
plt.figure(figsize = [16,7])# set figure size
plt.hist(beijing_price_pd['price'],bins = 50,alpha = 0.5,color = 'red',edgecolor = 'white', linewidth = 1.2) | |
plt.xlabel('price',size = 15) |
View beijing_neighbourhood
# Number of distinct property types per Beijing neighbourhood, largest first.
# The city literal uses single quotes, matching the other queries in this
# notebook; double quotes can be parsed as an identifier when Spark's ANSI
# double-quoted-identifier setting is enabled.
neighbourhood_property_sql = """
SELECT neighbourhood_cleansed,
       COUNT(DISTINCT property_type) AS property_type_count
FROM listings_df2
WHERE city = 'Beijing'
GROUP BY neighbourhood_cleansed
ORDER BY property_type_count DESC
"""
neighbourhood_property = spark.sql(neighbourhood_property_sql).cache()
neighbourhood_property.show()
View order_neighbourhood
# Bar chart of the neighbourhood count per city, cities ordered by
# descending total_neighbourhood.
neighbourhood_list = listing_total.sort("total_neighbourhood", ascending=False)
neighbourhood_list_pd = neighbourhood_list.toPandas()  # Spark -> pandas

# NOTE(review): the slice starts at row 1, so the city with the highest count
# (row 0) is left out and rows 1..30 are plotted - confirm this is intentional.
list_neighbourhood_30 = neighbourhood_list_pd.iloc[1:31]

# One bar every 4 x-units; with width=3 this leaves a gap of 1 between bars.
num = len(list_neighbourhood_30) * 4
c = list(range(0, num, 4))

plt.figure(figsize=(10, 4))
plt.bar(
    c,
    list_neighbourhood_30['total_neighbourhood'],
    width=3,
    align='center',
    alpha=0.5,
)
plt.xticks(c, list_neighbourhood_30['city'], rotation=90)
View plot_listings_pandas
# Bar chart of total_listing for the first 30 rows of listing_total.
# NOTE(review): "top 30" relies on listing_total already being ordered by
# total_listing; that ordering is not visible in this cell - confirm.
list_small = listing_total.limit(30).toPandas()  # Spark -> pandas

# One bar every 4 x-units; with width=3 this leaves a gap of 1 between bars.
num = len(list_small) * 4
c = list(range(0, num, 4))

plt.figure(figsize=(10, 4))
plt.bar(
    c,
    list_small['total_listing'],
    width=3,
    align='center',
    alpha=0.5,
)
plt.xticks(c, list_small['city'], rotation=90)
plt.xlabel('city')
plt.ylabel('total listing')
NewerOlder