@davidlenz
davidlenz / textblob_de_lemmatizer.py
Created May 25, 2018 09:55
Usage of the German TextBlob lemmatizer. Takes a list of strings and returns their lemmatized versions.
from textblob_de import TextBlobDE as TextBlob

def textblob_lemmatizer(doclist):
    """Takes a list of strings as input and returns a list of lemmatized strings"""
    docs = []
    for doc in doclist:
        blob = TextBlob(doc)
        docs.append(' '.join(blob.words.lemmatize()))
    return docs
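A brief usage sketch for textblob_lemmatizer from the snippet above; the example sentence is illustrative and assumes the textblob-de package is installed.
# illustrative call of textblob_lemmatizer() defined above
sentences = ["Die Katzen saßen auf den alten Bänken."]
print(textblob_lemmatizer(sentences))  # one lemmatized string per input document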
@davidlenz
davidlenz / spacy_lemmatizer.py
Last active May 25, 2018 09:57
Usage of the spaCy lemmatizer. Converts a list of strings to their lemmatized versions.
import spacy

# batch size and thread count used for lemmatization; `settings` is assumed to be a project config module
settings.LEMMATIZER_BATCH_SIZE = 250
settings.LEMMATIZER_N_THREADS = -1

nlp = spacy.load('de')
nlp.disable_pipes('tagger', 'ner')

def spacy_lemmatizer(text, nlp):
    """text is a list of strings; nlp is a spaCy nlp object. Use nlp.disable_pipes('tagger', 'ner') to speed up lemmatization."""
@davidlenz
davidlenz / stopwords.py
Last active June 5, 2018 10:26
Function to generate a list of stopwords from different sources.
import stop_words
from langdetect import detect
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import ast
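The preview only shows the imports. A minimal sketch of what such a stopword collector could look like, assuming the stop_words and NLTK resources imported above; the language mapping and the use of langdetect are illustrative choices, not the gist's actual code.
import stop_words
import nltk
from nltk.corpus import stopwords
from langdetect import detect

nltk.download('stopwords', quiet=True)

# map langdetect ISO codes to NLTK corpus names (only the languages used here)
NLTK_NAMES = {'de': 'german', 'en': 'english'}

def get_stopwords(sample_text):
    """Detect the language of sample_text and merge stopwords from stop_words and NLTK."""
    lang = detect(sample_text)                      # e.g. 'de' or 'en'
    words = set(stop_words.get_stop_words(lang))    # stop_words accepts ISO codes
    words |= set(stopwords.words(NLTK_NAMES.get(lang, 'english')))
    return sorted(words)

if __name__ == '__main__':
    print(get_stopwords("Das ist ein kurzer deutscher Beispieltext.")[:10])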
@davidlenz
davidlenz / jensen-shannon-divergence.py
Last active December 5, 2020 07:05
Implementation of the Jensen-Shannon divergence, based on https://github.com/scipy/scipy/issues/8244
import numpy as np
from scipy.stats import entropy

def js(p, q):
    # cast to float so the in-place normalization below also works for integer inputs
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    # normalize to probability distributions
    p /= p.sum()
    q /= q.sum()
    m = (p + q) / 2
    # JSD(p, q) = (KL(p || m) + KL(q || m)) / 2
    return (entropy(p, m) + entropy(q, m)) / 2
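An illustrative sanity check of js(): the Jensen-Shannon divergence is symmetric and zero for identical distributions.
p, q = [0.1, 0.4, 0.5], [0.3, 0.3, 0.4]
print(js(p, q), js(q, p))  # the two values are equal
print(js(p, p))            # 0.0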
@davidlenz
davidlenz / attention_lstm.py
Created April 27, 2018 10:07 — forked from mbollmann/attention_lstm.py
My attempt at creating an LSTM with attention in Keras
class AttentionLSTM(LSTM):
    """LSTM with attention mechanism

    This is an LSTM incorporating an attention mechanism into its hidden states.
    Currently, the context vector calculated from the attended vector is fed
    into the model's internal states, closely following the model by Xu et al.
    (2016, Sec. 3.1.2), using a soft attention model following
    Bahdanau et al. (2014).

    The layer expects two inputs instead of the usual one:
@davidlenz
davidlenz / selenium_google_scrape.py
Created April 26, 2018 21:17
Search on Google and return a list of results with URLs. Tweaked from https://gist.github.com/azam-a/32b89944b98a3fd79d44ebfdac16b63d
# https://gist.github.com/azam-a/32b89944b98a3fd79d44ebfdac16b63d
import pandas as pd
import selenium
print('selenium.__version__: ', selenium.__version__)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
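The preview stops after the imports. A minimal sketch of a search helper built on those imports; the Google result selector ('div#search a') and the Chrome driver are assumptions, since Google's markup changes frequently and the gist's real selectors are not shown.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def google_search(query, driver, timeout=10):
    """Return a dataframe of (title, url) pairs for a Google search; the CSS selectors are assumptions."""
    driver.get('https://www.google.com/search?q=' + query.replace(' ', '+'))
    # wait until the (assumed) result container is present
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div#search')))
    anchors = driver.find_elements(By.CSS_SELECTOR, 'div#search a')
    rows = [(a.text, a.get_attribute('href')) for a in anchors if a.get_attribute('href')]
    return pd.DataFrame(rows, columns=['title', 'url'])

if __name__ == '__main__':
    driver = webdriver.Chrome()
    try:
        print(google_search('jensen shannon divergence', driver).head())
    finally:
        driver.quit()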
@davidlenz
davidlenz / twitter_scraper.py
Last active April 25, 2018 17:39
Scrape data from Twitter and extract sentiment using VaderSentiment. Code is from https://www.pythonprogramming.net/
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
import sqlite3
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from unidecode import unidecode
import time
analyzer = SentimentIntensityAnalyzer()
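The preview ends after the analyzer is created. A minimal sketch of the rest of a VADER-scoring stream listener, assuming the tweepy 3.x StreamListener API, placeholder credentials, and an illustrative SQLite schema; the original pythonprogramming.net code differs in its details.
import json
import sqlite3
import time
from tweepy import OAuthHandler, Stream
from tweepy.streaming import StreamListener
from unidecode import unidecode
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
conn = sqlite3.connect('twitter.db')
conn.execute('CREATE TABLE IF NOT EXISTS sentiment (unix REAL, tweet TEXT, compound REAL)')

class Listener(StreamListener):
    def on_data(self, data):
        msg = json.loads(data)
        if 'text' not in msg:          # skip keep-alives and delete notices
            return True
        tweet = unidecode(msg['text'])
        score = analyzer.polarity_scores(tweet)['compound']   # compound score in [-1, 1]
        conn.execute('INSERT INTO sentiment VALUES (?, ?, ?)', (time.time(), tweet, score))
        conn.commit()
        return True

    def on_error(self, status_code):
        print(status_code)

if __name__ == '__main__':
    # placeholder credentials; real keys come from the Twitter developer console
    auth = OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
    auth.set_access_token('ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')
    Stream(auth, Listener()).filter(track=['python'])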
@davidlenz
davidlenz / reddit_submissions_stream.py
Last active April 25, 2018 16:20
Stream Reddit submissions with PRAW. Additionally finds URLs in submissions and extracts their text.
import newsapi_v2
import findurls
import praw
import pandas as pd
import utils_func
import os
import subreddit
import requests
from newspaper import fulltext
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
url matching regex
http://daringfireball.net/2010/07/improved_regex_for_matching_urls
"""
"""
The regex patterns in this gist are intended to match any URLs,
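The preview shows the imports and the start of a URL-matching helper. A minimal sketch of the submission stream itself, assuming standard PRAW credentials and newspaper's fulltext() for article extraction; the simplified regex stands in for the full Gruber pattern referenced above, and the gist's own helper modules (findurls, utils_func, subreddit, newsapi_v2) are not reproduced here.
import re
import praw
import requests
from newspaper import fulltext

URL_RE = re.compile(r'https?://\S+')   # simplified stand-in for the Gruber pattern referenced above

def stream_submissions(subreddit_name='all'):
    # placeholder credentials; real values come from a Reddit app registration
    reddit = praw.Reddit(client_id='CLIENT_ID',
                         client_secret='CLIENT_SECRET',
                         user_agent='submission-stream-example')
    for submission in reddit.subreddit(subreddit_name).stream.submissions():
        urls = URL_RE.findall(submission.selftext or '') + [submission.url]
        for url in urls:
            try:
                text = fulltext(requests.get(url, timeout=10).text)
            except Exception:
                continue   # skip non-article pages and download errors
            print(submission.id, url, text[:80])

if __name__ == '__main__':
    stream_submissions('news')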
@davidlenz
davidlenz / scrape_newsapi.py
Last active April 26, 2018 13:26
Scrape the sources from the newsapi headlines every 12 hours. https://newsapi.org/
import justext, time
import pandas as pd
import requests, urllib
import utils_func
def get_sources(key):
    """
    Retrieve all sources from newsapi, filter the German and English speaking
    ones and return them as a dataframe.
    """
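The preview cuts off inside get_sources. A minimal sketch of how the body might continue, assuming the newsapi /v2/sources endpoint, which returns a JSON object with a 'sources' list carrying a 'language' field; the column names used for filtering rest on that assumption.
import pandas as pd
import requests

def get_sources(key):
    """
    Retrieve all sources from newsapi, keep the German and English speaking
    ones and return them as a dataframe.
    """
    response = requests.get('https://newsapi.org/v2/sources', params={'apiKey': key})
    sources = pd.DataFrame(response.json()['sources'])
    return sources[sources['language'].isin(['de', 'en'])]

if __name__ == '__main__':
    df = get_sources('YOUR_API_KEY')   # placeholder key
    print(df[['id', 'name', 'language']].head())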