Skip to content

Instantly share code, notes, and snippets.

View dkav9's full-sized avatar

Dmitriy dkav9

View GitHub Profile
def get_documents(urls):
"""
Takes a list of urls, gets unique sentences from each website,
and returns all of those sentences in a single list
"""
# NOTE(review): this snippet is a truncated preview - no return statement is
# visible here, so the final shape of the result cannot be confirmed.
documents = []
# tqdm wraps the iterable so progress is reported per url
for url in tqdm(urls):
sentences = get_sentences(url)
# `model` and `similarity_threshold` are not parameters of this function;
# they are read from an enclosing scope that is not shown in this snippet.
new_sentences = remove_similar_sentences(sentences, model, similarity_threshold)
# NOTE(review): append stores each site's list as ONE element, giving a list
# of lists, while the docstring promises a single list of sentences -
# confirm whether extend was intended or flattening happens further down.
documents.append(new_sentences)
def remove_similar_sentences(sentences, model, similarity_threshold):
"""
model: the nlp model to use from the sentence transformer library
sentences: a list of sentences from a website
Returns a list of new sentences which are not too similar to each other
"""
# Work on a shallow copy - presumably so later removals do not mutate the
# caller's list (the removal step is not visible in this snippet).
new_sentences = sentences.copy()
# Compute embeddings
# Encoding is pinned to the CPU and the progress bar is switched off.
embeddings = model.encode(new_sentences, device='cpu', show_progress_bar=False)
# Compute cosine-similarities for each sentence with each other sentence
# NOTE(review): the preview ends here; how `similarity_threshold` is applied
# to the similarities is not shown - confirm against the full gist.
def get_sentences(url):
"""
Takes a url and outputs a list of sentences from the text of the website
"""
# Build the article for this url, then download and parse it in one guarded step
try:
article = Article(url)
article.download()
article.parse()
# NOTE(review): the bare except silently swallows every failure from the three
# calls above (including KeyboardInterrupt). If `Article(url)` itself raises,
# `article` is never bound, so any later use of it would raise NameError.
except:
pass
# NOTE(review): the preview ends here; the code that turns the parsed article
# into sentences is not visible in this snippet.
from sentence_transformers import SentenceTransformer, util
from googlesearch import search
from newspaper import Article
from bertopic import BERTopic
from nltk import tokenize
from tqdm import tqdm
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
def scrape_urls(query, num):
    """
    Runs a google search for the query and collects the result links.

    query: the search string handed to google
    num: forwarded to the search as its ``stop`` value
    Returns a list of result urls, leaving out every url that contains
    "tripadvisor"
    """
    links = []
    for link in search(query, stop=num):
        # Trip advisor pages don't open properly, so they are left out
        if "tripadvisor" in link:
            continue
        links.append(link)
    return links
@dkav9
dkav9 / app.py
Created July 7, 2019 04:38
dash application for serving up and visualizing data from lambda API
import dash
from dash.dependencies import Input, Output
import dash_table
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
import plotly.figure_factory as ff
import pandas as pd
import scipy
from chalice import Chalice
import boto3
import json
import pandas as pd
import numpy as np
app = Chalice(app_name="Lambda4")
"""API for serving aggregated data"""
@app.route("/")
from chalice import Chalice
import boto3
import json
import pandas as pd
from datetime import datetime
app = Chalice(app_name="Lambda3")
"""On SNS message from Lambda2, aggregate sentiment data and store it"""
@app.on_sns_message(topic='Lambda2Event')
@dkav9
dkav9 / Transform.py
Last active September 21, 2021 12:26
from chalice import Chalice
import boto3
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from datetime import datetime
app = Chalice(app_name='Lambda2')
"""On SNS message from Lambda1, transform data by dropping duplicates
and get sentiment"""
import imp
import sys
sys.modules["sqlite"] = imp.new_module("sqlite")
sys.modules["sqlite3.dbapi2"] = imp.new_module("sqlite.dbapi2")
from chalice import Chalice, Rate
from finviz.main_func import get_news
from newspaper import Article
import boto3
import json