Nem_pickaxe Nempickaxe

## word2vec.py
import dbm, os
import cPickle as pickle
from gensim.models import Word2Vec
import numpy as np

def save_model(model, directory):
    model.init_sims() # making sure syn0norm is initialised
    if not os.path.exists(directory):
        os.makedirs(directory)
    # Saving indexes as DBM'ed dictionary

## get_grams.py
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

from collections import Counter

## keybase.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                Nempickaxe
                / keybase.md
            
            
              Created
              May 26, 2019 06:22
            
              
                keybase public key
              
          
### Keybase proof

I hereby claim:

  * I am nempickaxe on github.
  * I am ilaichi (https://keybase.io/ilaichi) on keybase.
  * I have a public key ASC0peYsZX_Z7LwCfPjY9FJz_772TLP9XsoLON6QsTED-go

To claim this, I am signing this object:

  
## create_lower_triangle_heatmap.py
def get_lower_tri_heatmap(df, output="cooc_matrix.png"):
    """Begin building a lower-triangle heatmap for ``df``.

    Only the opening lines of this function are visible in this excerpt:
    they build a boolean mask with the same shape as ``df`` and create the
    matplotlib figure.

    :param df: 2-D array-like; the triangle/diagonal index helpers used
        below expect a square 2-D input
    :param output: not referenced in the visible lines; presumably the
        path the image is written to -- TODO confirm against the full file
    """
    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    mask = np.zeros_like(df, dtype=bool)
    # Flag the whole upper triangle (diagonal included at this point).
    mask[np.triu_indices_from(mask)] = True

    # Want diagonal elements as well
    mask[np.diag_indices_from(mask)] = False

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(11, 9))

## text2png.py
import textwrap
import PIL
from PIL import ImageFont
from PIL import Image
from PIL import ImageDraw

def text2png(text, fullpath, color = "#000", bgcolor = "#FFF", fontfullpath = None, fontsize = 13, leftpadding = 3, rightpadding = 3, width = 2000):
    REPLACEMENT_CHARACTER = '\uFFFD'
    NEWLINE_REPLACEMENT_STRING = ' ' + REPLACEMENT_CHARACTER + ' '

## preprocessing_steps.py
import re
import nltk
import emoji
from nltk.tokenize import word_tokenize

def tokenize(corpus):
    data = re.sub(r'[,!?;-]+', '.', corpus)
    data = nltk.word_tokenize(data)  # tokenize string to words
    data = [ ch.lower() for ch in data
             if ch.isalpha()

## add_args_yaml_file.py
import re
import yaml

def parse_config(vars_dict,  path=None, data=None, tag='!ENV'):
    """
    Load a yaml configuration file and resolve any environment variables
    The environment variables must have !ENV before them and be in this format
    to be parsed: $<VAR_NAME>.
    E.g.:
    database:

## split_text_max_width.py
def get_interval(space_list, width):
    """Return the entry that precedes the first entry greater than ``width``.

    Neighbouring pairs are scanned from left to right. As soon as the
    second member of a pair is greater than ``width``, the first member
    of that pair is returned. The very first entry is never compared
    itself. If no pair matches, the last entry is returned.

    :param space_list: non-empty sequence of values comparable to ``width``
    :param width: threshold the entries are compared against
    :return: an element of ``space_list``
    :raises IndexError: if ``space_list`` is empty
    """
    for current, following in zip(space_list, space_list[1:]):
        if following > width:
            return current
    return space_list[-1]

def get_subtracted_list(space_list, width):
    """Subtract ``width`` from every entry, clamping negatives to zero.

    For a difference ``d``, ``(d + abs(d)) / 2`` equals ``d`` when ``d``
    is positive and ``0`` otherwise; the result is then passed through
    ``int``.

    :param space_list: iterable of numbers
    :param width: amount subtracted from each entry
    :return: list of ints, one per entry of ``space_list``
    """
    deltas = (position - width for position in space_list)
    return [int((delta + abs(delta)) / 2) for delta in deltas]

## get_size_variable.py
import sys
from types import ModuleType, FunctionType
from gc import get_referents

# Custom objects know their class.
# Function objects seem to know way too much, including modules.
# Exclude modules as well.
BLACKLIST = type, ModuleType, FunctionType


## read_mongo.py
def read_mongo_collection(uri, pipeline=None, given_schema=None, spark=None):
    """
    :param uri: uri for mongo connection
    :param pipeline: pipeline option for pushing queries to mongo
    :param given_schema: schema option, will read in mentioned schema
    :return: dataframe after reading from mongo
    """
    if pipeline:
        if not given_schema:
            return spark.read.format("com.mongodb.spark.sql.DefaultSource").option("pipeline", pipeline).option(
	import dbm, os
	import cPickle as pickle
	from gensim.models import Word2Vec
	import numpy as np

	def save_model(model, directory):
	model.init_sims() # making sure syn0norm is initialised
	if not os.path.exists(directory):
	os.makedirs(directory)
	# Saving indexes as DBM'ed dictionary
	import nltk
	from nltk.tokenize import WordPunctTokenizer
	from nltk.collocations import BigramCollocationFinder
	from nltk.metrics import BigramAssocMeasures
	from nltk.corpus import stopwords
	nltk.download('stopwords')
	from nltk.collocations import TrigramCollocationFinder
	from nltk.metrics import TrigramAssocMeasures

	from collections import Counter
	def get_lower_tri_heatmap(df, output="cooc_matrix.png"):
	    """Begin building a lower-triangle heatmap for ``df``.

	    Only the opening lines of this function are visible in this excerpt:
	    they build a boolean mask with the same shape as ``df`` and create
	    the matplotlib figure.

	    :param df: 2-D array-like; the triangle/diagonal index helpers used
	        below expect a square 2-D input
	    :param output: not referenced in the visible lines; presumably the
	        path the image is written to -- TODO confirm against the full file
	    """
	    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
	    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
	    mask = np.zeros_like(df, dtype=bool)
	    # Flag the whole upper triangle (diagonal included at this point).
	    mask[np.triu_indices_from(mask)] = True

	    # Want diagonal elements as well
	    mask[np.diag_indices_from(mask)] = False

	    # Set up the matplotlib figure
	    f, ax = plt.subplots(figsize=(11, 9))
	import textwrap
	import PIL
	from PIL import ImageFont
	from PIL import Image
	from PIL import ImageDraw

	def text2png(text, fullpath, color = "#000", bgcolor = "#FFF", fontfullpath = None, fontsize = 13, leftpadding = 3, rightpadding = 3, width = 2000):
	REPLACEMENT_CHARACTER = '\uFFFD'
	NEWLINE_REPLACEMENT_STRING = ' ' + REPLACEMENT_CHARACTER + ' '
	import re
	import nltk
	import emoji
	from nltk.tokenize import word_tokenize

	def tokenize(corpus):
	data = re.sub(r'[,!?;-]+', '.', corpus)
	data = nltk.word_tokenize(data) # tokenize string to words
	data = [ ch.lower() for ch in data
	if ch.isalpha()
	import re
	import yaml

	def parse_config(vars_dict, path=None, data=None, tag='!ENV'):
	"""
	Load a yaml configuration file and resolve any environment variables
	The environment variables must have !ENV before them and be in this format
	to be parsed: $<VAR_NAME>.
	E.g.:
	database:
	def get_interval(space_list, width):
	    """Return the entry that precedes the first entry greater than ``width``.

	    Neighbouring pairs are scanned from left to right. As soon as the
	    second member of a pair is greater than ``width``, the first member
	    of that pair is returned. The very first entry is never compared
	    itself. If no pair matches, the last entry is returned.

	    :param space_list: non-empty sequence of values comparable to ``width``
	    :param width: threshold the entries are compared against
	    :return: an element of ``space_list``
	    :raises IndexError: if ``space_list`` is empty
	    """
	    for current, following in zip(space_list, space_list[1:]):
	        if following > width:
	            return current
	    return space_list[-1]

	def get_subtracted_list(space_list, width):
	    """Subtract ``width`` from every entry, clamping negatives to zero.

	    For a difference ``d``, ``(d + abs(d)) / 2`` equals ``d`` when ``d``
	    is positive and ``0`` otherwise; the result is then passed through
	    ``int``.

	    :param space_list: iterable of numbers
	    :param width: amount subtracted from each entry
	    :return: list of ints, one per entry of ``space_list``
	    """
	    return [int(((x - width) + abs(x - width)) / 2) for x in space_list]
	import sys
	from types import ModuleType, FunctionType
	from gc import get_referents

	# Custom objects know their class.
	# Function objects seem to know way too much, including modules.
	# Exclude modules as well.
	BLACKLIST = type, ModuleType, FunctionType
	def read_mongo_collection(uri, pipeline=None, given_schema=None, spark=None):
	"""
	:param uri: uri for mongo connection
	:param pipeline: pipeline option for pushing queries to mongo
	:param given_schema: schema option, will read in mentioned schema
	:return: dataframe after reading from mongo
	"""
	if pipeline:
	if not given_schema:
	return spark.read.format("com.mongodb.spark.sql.DefaultSource").option("pipeline", pipeline).option(