This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def get_documents(urls): | |
| """ | |
| Takes a list of urls, gets unique sentences from each website, | |
| and returns all of those sentences in a single list | |
| """ | |
| documents = [] | |
| for url in tqdm(urls): | |
| sentences = get_sentences(url) | |
| new_sentences = remove_similar_sentences(sentences, model, similarity_threshold) | |
| documents.append(new_sentences) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def remove_similar_sentences(sentences, model, similarity_threshold): | |
| """ | |
| model: the nlp model to use from the sentence transformer library | |
| sentences: a list of sentences from a website | |
| Returns a list of new sentences which are not too similar to each other | |
| """ | |
| new_sentences = sentences.copy() | |
| # Compute embeddings | |
| embeddings = model.encode(new_sentences, device='cpu', show_progress_bar=False) | |
| # Compute cosine-similarities for each sentence with each other sentence |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def get_sentences(url): | |
| """ | |
| Takes a url and outputs a list of sentences from the text of the website | |
| """ | |
| try: | |
| article = Article(url) | |
| article.download() | |
| article.parse() | |
| except: | |
| pass |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sentence_transformers import SentenceTransformer, util | |
| from googlesearch import search | |
| from newspaper import Article | |
| from bertopic import BERTopic | |
| from nltk import tokenize | |
| from tqdm import tqdm | |
| import nltk | |
| import re | |
| nltk.download('punkt') | |
| nltk.download('stopwords') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def scrape_urls(query, num, excluded=("tripadvisor",)):
    """
    Takes a search query and returns links to websites of interest from google

    query: the search string handed to ``search``
    num: forwarded to ``search`` as its ``stop`` argument
    excluded: substrings identifying sites to omit; any url that contains one
        of them is dropped. Defaults to ("tripadvisor",) because Trip advisor
        urls don't open properly. A single string is treated as one substring.
    Returns a list of the remaining urls, in the order ``search`` yielded them
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(excluded, str):
        excluded = (excluded,)
    # Get urls from google search and omit the sites that don't open properly
    return [
        url
        for url in search(query, stop=num)
        if not any(site in url for site in excluded)
    ]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import dash | |
| from dash.dependencies import Input, Output | |
| import dash_table | |
| import dash_core_components as dcc | |
| import dash_html_components as html | |
| import dash_bootstrap_components as dbc | |
| import plotly.figure_factory as ff | |
| import pandas as pd | |
| import scipy |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from chalice import Chalice | |
| import boto3 | |
| import json | |
| import pandas as pd | |
| import numpy as np | |
| app = Chalice(app_name="Lambda4") | |
| """API for serving aggregated data""" | |
| @app.route("/") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from chalice import Chalice | |
| import boto3 | |
| import json | |
| import pandas as pd | |
| from datetime import datetime | |
| app = Chalice(app_name="Lambda3") | |
| """On SNS message from Lambda2, aggregate sentiment data and store it""" | |
| @app.on_sns_message(topic='Lambda2Event') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from chalice import Chalice | |
| import boto3 | |
| from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer | |
| import pandas as pd | |
| from datetime import datetime | |
| app = Chalice(app_name='Lambda2') | |
| """On SNS message from Lambda1, transform data by dropping duplicates | |
| and get sentiment""" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys
import types

# Register empty placeholder modules under the "sqlite" and "sqlite3.dbapi2"
# keys of sys.modules. This must stay above the third-party imports that
# follow so the placeholders are already present when those imports run.
# NOTE(review): presumably this works around a missing sqlite build in the
# deployment environment -- confirm against the deployment target.
# types.ModuleType is used instead of imp.new_module because the `imp`
# module is deprecated and no longer exists in Python 3.12+.
sys.modules["sqlite"] = types.ModuleType("sqlite")
sys.modules["sqlite3.dbapi2"] = types.ModuleType("sqlite.dbapi2")
| from chalice import Chalice, Rate | |
| from finviz.main_func import get_news | |
| from newspaper import Article | |
| import boto3 | |
| import json |