Skip to content

Instantly share code, notes, and snippets.

View Nempickaxe's full-sized avatar
🤔
print('hi')

Nem_pickaxe Nempickaxe

🤔
print('hi')
View GitHub Profile
@Nempickaxe
Nempickaxe / word2vec.py
Last active September 17, 2020 16:48 — forked from moustaki/Faster save-load for word2vec
Faster save-load for word2vec
import dbm, os
import cPickle as pickle
from gensim.models import Word2Vec
import numpy as np
def save_model(model, directory):
model.init_sims() # making sure syn0norm is initialised
if not os.path.exists(directory):
os.makedirs(directory)
# Saving indexes as DBM'ed dictionary
@Nempickaxe
Nempickaxe / get_grams.py
Created February 26, 2019 09:12
get bigrams and trigrams
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
from collections import Counter
@Nempickaxe
Nempickaxe / keybase.md
Created May 26, 2019 06:22
keybase public key

Keybase proof

I hereby claim:

  • I am nempickaxe on github.
  • I am ilaichi (https://keybase.io/ilaichi) on keybase.
  • I have a public key ASC0peYsZX_Z7LwCfPjY9FJz_772TLP9XsoLON6QsTED-go

To claim this, I am signing this object:

def get_lower_tri_heatmap(df, output="cooc_matrix.png"):
mask = np.zeros_like(df, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
# Want diagonal elements as well
mask[np.diag_indices_from(mask)] = False
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
@Nempickaxe
Nempickaxe / text2png.py
Created July 14, 2020 11:54
Convert a text to image using pillow
import textwrap
import PIL
from PIL import ImageFont
from PIL import Image
from PIL import ImageDraw
def text2png(text, fullpath, color = "#000", bgcolor = "#FFF", fontfullpath = None, fontsize = 13, leftpadding = 3, rightpadding = 3, width = 2000):
REPLACEMENT_CHARACTER = '\uFFFD'
NEWLINE_REPLACEMENT_STRING = ' ' + REPLACEMENT_CHARACTER + ' '
@Nempickaxe
Nempickaxe / preprocessing_steps.py
Created September 17, 2020 16:38
nltk preprocessing function
import re
import nltk
import emoji
from nltk.tokenize import word_tokenize
def tokenize(corpus):
data = re.sub(r'[,!?;-]+', '.', corpus)
data = nltk.word_tokenize(data) # tokenize string to words
data = [ ch.lower() for ch in data
if ch.isalpha()
import re
import yaml
def parse_config(vars_dict, path=None, data=None, tag='!ENV'):
"""
Load a yaml configuration file and resolve any environment variables
The environment variables must have !ENV before them and be in this format
to be parsed: $<VAR_NAME>.
E.g.:
database:
@Nempickaxe
Nempickaxe / split_text_max_width.py
Last active February 11, 2021 15:52
split a sentence based on maximum character width of sentences
def get_interval(space_list, width):
    """Return the entry just before the first one that exceeds ``width``.

    Consecutive pairs of ``space_list`` are scanned in order.  As soon as the
    second member of a pair is greater than ``width``, the first member of
    that pair is returned.  If no such pair exists, the last entry of
    ``space_list`` is returned.

    Note that the first entry is never compared against ``width`` itself, so
    it can be returned even when it is larger than ``width``.

    Args:
        space_list: Non-empty, sliceable sequence of comparable values
            (presumably positions of spaces in a sentence -- confirm
            against the caller).
        width: Threshold the entries are compared against.

    Returns:
        The selected entry of ``space_list``.

    Raises:
        IndexError: If ``space_list`` is empty.
    """
    # Walk adjacent pairs instead of indexing with i / i + 1.
    for current, following in zip(space_list, space_list[1:]):
        if following > width:
            return current
    # Nothing after the first entry exceeds the width: use the final entry.
    return space_list[-1]
def get_subtracted_list(space_list, width):
    """Subtract ``width`` from every entry, clamping negative results to zero.

    Each result is converted with ``int``, so fractional differences are
    truncated towards zero.

    The difference is clamped with ``max`` rather than the
    ``((d) + abs(d)) / 2`` float trick, so large integers are not rounded
    through a 53-bit float mantissa.

    Args:
        space_list: Iterable of numbers.
        width: Amount subtracted from each entry.

    Returns:
        A new list of ints, one per entry of ``space_list``.
    """
    return [int(max(value - width, 0)) for value in space_list]
@Nempickaxe
Nempickaxe / get_size_variable.py
Created August 25, 2021 13:55
Get memory requirement for a variable
import sys
from types import ModuleType, FunctionType
from gc import get_referents
# Kinds of object to exclude: classes, modules and functions.
# Every custom object refers back to its class, and function objects appear to
# refer to far too much (modules included), so modules are excluded as well.
BLACKLIST = (type, ModuleType, FunctionType)
@Nempickaxe
Nempickaxe / read_mongo.py
Created September 9, 2021 07:57
Read from Mongo
def read_mongo_collection(uri, pipeline=None, given_schema=None, spark=None):
"""
:param uri: uri for mongo connection
:param pipeline: pipeline option for pushing queries to mongo
:param given_schema: schema option, will read in mentioned schema
:return: dataframe after reading from mongo
"""
if pipeline:
if not given_schema:
return spark.read.format("com.mongodb.spark.sql.DefaultSource").option("pipeline", pipeline).option(