This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from textblob_de import TextBlobDE as TextBlob | |
def textblob_lemmatizer(doclist):
    """Lemmatize a list of German documents with TextBlobDE.

    Parameters
    ----------
    doclist : list of str
        Raw document strings.

    Returns
    -------
    list of str
        One string per input document, consisting of the document's
        lemmas joined by single spaces.
    """
    # blob.words.lemmatize() returns an iterable of lemma strings, so it can
    # be fed to ' '.join directly -- no intermediate list() needed.
    return [' '.join(TextBlob(doc).words.lemmatize()) for doc in doclist]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy | |
# Batch-size / thread-count knobs for the lemmatizer pipeline.
# NOTE(review): `settings` is not imported in this snippet -- presumably a
# project-level config module; confirm it is in scope before running.
# N_THREADS = -1 conventionally means "use all available cores" -- verify
# against the consumer of this setting.
settings.LEMMATIZER_BATCH_SIZE = 250 | |
settings.LEMMATIZER_N_THREADS = -1 | |
# Load the German model via the spaCy v2-style shortcut name 'de'
# (spaCy v3 would require the full package name, e.g. 'de_core_news_sm').
nlp = spacy.load('de') | |
# Disable the POS tagger and NER components: only the lemmatizer is needed,
# so dropping them speeds up pipeline runs.
nlp.disable_pipes('tagger', 'ner') | |
def spacy_lemmatizer(text, nlp): | |
"""text is a list of string. nlp is a spacy nlp object. Use nlp.disable_pipes('tagger','ner') to speed up lemmatization""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import stop_words | |
from langdetect import detect | |
import nltk | |
nltk.download('stopwords') | |
from nltk.corpus import stopwords | |
import ast | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from scipy.stats import entropy | |
def js(p, q): | |
p = np.asarray(p) | |
q = np.asarray(q) | |
# normalize | |
p /= p.sum() | |
q /= q.sum() | |
m = (p + q) / 2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class AttentionLSTM(LSTM): | |
"""LSTM with attention mechanism | |
This is an LSTM incorporating an attention mechanism into its hidden states. | |
Currently, the context vector calculated from the attended vector is fed | |
into the model's internal states, closely following the model by Xu et al. | |
(2016, Sec. 3.1.2), using a soft attention model following | |
Bahdanau et al. (2014). | |
The layer expects two inputs instead of the usual one: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://gist.github.com/azam-a/32b89944b98a3fd79d44ebfdac16b63d | |
import pandas as pd | |
import selenium | |
print('selenium.__version__: ', selenium.__version__) | |
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from tweepy import Stream | |
from tweepy import OAuthHandler | |
from tweepy.streaming import StreamListener | |
import json | |
import sqlite3 | |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer | |
from unidecode import unidecode | |
import time | |
analyzer = SentimentIntensityAnalyzer() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import newsapi_v2 | |
import findurls | |
import praw | |
import pandas as pd | |
import utils_func | |
import os | |
import subreddit | |
import requests | |
from newspaper import fulltext |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
""" | |
url matching regex | |
http://daringfireball.net/2010/07/improved_regex_for_matching_urls | |
""" | |
""" | |
The regex patterns in this gist are intended to match any URLs, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import justext, time | |
import pandas as pd | |
import requests, urllib | |
import utils_func | |
def get_sources(key): | |
""" | |
retrieve all sources from newsapi, filter the german and english speaking | |
and return them as dataframe |