Skip to content

Instantly share code, notes, and snippets.

View serenamm's full-sized avatar

Serena McDonnell serenamm

  • Toronto / Vancouver / Hong Kong
View GitHub Profile
@serenamm
serenamm / twitter_insights.md
Last active February 19, 2020 02:43
Usefulness of Twitter data in making investment decisions

Usefulness of Twitter data in making investment decisions

We had no part in generating the following charts. We are looking for your opinion on whether they can be useful in gathering information for trading purposes.

Part 1: Questions about Figures

Hashtag Frequency

Hashtags are widely used on Twitter. Here we look at yesterday's most popular hashtags among a subset of Twitter users.

$x=n^2$
from mock import mock
create_table_query = '''
SELECT
item_id_1,
item_id_2
FROM (
SELECT
item_id_1,
item_id_2,
def test_create_new_table(mocker):
# Mock all our variables
mock_spark = mock.Mock()
mock_category_q = mock.Mock()
mock_created_table = mock.Mock()
mock_created_table_coalesced = mock.Mock()
# Calling spark.sql with create_table_query returns created_table - we need to mock it
mock_spark.sql.side_effect = [mock_created_table]
# Mock the output of calling .coalesce on created_table
def test_make_query_true(mocker):
# Create some fake table paths
test_paths = {
"product_table": {
"table": "products",
},
"similarity_table": {
"table": "product_similarity"
}
import pyspark
from pyspark.sql import SparkSession
create_table_query = '''
SELECT
item_id_1,
item_id_2
FROM (
SELECT
item_id_1,
def create_new_table(spark, table_paths, params, same_category_q):
similarity_table = table_paths["product_similarity"]["table"]
created_table = spark.sql(create_table_query.format(similarity_table=similarity_table,
same_category_q=same_category_q,
num_items=params["num_items"]))
# Write table to some path
created_table.coalesce(1).write.save(table_paths["created_table"]["path"],
'''
SELECT
s.item_id_1,
s.item_id_2,
s.similarity_score
FROM product_similarity s
{same_category_q}
'''.format(same_category_q='') # Depends on value of same_category boolean
SELECT
s.item_id_1,
s.item_id_2,
s.similarity_score
FROM (
SELECT
s.item_id_1,
s.item_id_2,
s.similarity_score,
ROW_NUMBER() OVER(PARTITION BY item_id_1 ORDER BY similarity_score DESC) as row_num
SELECT
s.item_id_1,
s.item_id_2,
s.similarity_score
FROM product_similarity s
INNER JOIN products p
ON s.item_id_1 = p.item_id
INNER JOIN products q
ON s.item_id_2 = q.item_id
WHERE s.item_id_1 != s.item_id_2