# Querying the trained embedding model
similarity = embed_model.wv.similarity('sorted', 'ord')  # cosine similarity between two tokens
most_similar = embed_model.wv.most_similar('len(x)')     # nearest tokens in embedding space
vector = embed_model.wv['for']                           # raw embedding vector of a token
from gensim.models import FastText

embed_model = FastText(vector_size=meta_hyper['vector_size'],
                       window=meta_hyper['window'],
                       min_count=meta_hyper['min_count'],
                       alpha=meta_hyper['alpha'],
                       workers=meta_hyper['CPU'])
embed_model.build_vocab(tokenized_data)
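
The fragment stops after building the vocabulary, so the training call itself is missing. A minimal sketch using gensim's standard train API and the epoch count from meta_hyper below:

# Not in the original gist: train the embeddings after build_vocab
embed_model.train(tokenized_data,
                  total_examples=len(tokenized_data),
                  epochs=meta_hyper['epochs'])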
meta_hyper = {
    "vector_size": 192,  # size of embedding
    "alpha": 0.025,      # learning rate
    "window": 5,
    "min_count": 5,      # minimum token frequency (the model above reads this key)
    "epochs": 300,       # number of training epochs
    "CPU": 4,            # worker threads; assumed value, the model above reads meta_hyper['CPU']
    "vocab_size": len(set(t for td in tokenized_data for t in td)),  # size of vocabulary
    "tokens_number": sum(len(td) for td in tokenized_data),          # total number of tokens
    "data_size": len(tokenized_data),
    "data_description": "Add your description here",
}
rootdir = "PATH TO THE DIRECTORY OF THE PROJECT YOU CLONED FROM GITHUB"
files_code = load_all_files(rootdir)

tokenized_data = []
for d in files_code:
    try:
        tokenized_data.append(tokenize_python(d))
    except Exception as e:
        print('tokenization error:', e)
from nltk.tokenize import word_tokenize
from tokenize import tokenize
from io import BytesIO

rename_globals = {}

def tokenize_python(code):
    # tokenize a Python source string into a list of token strings;
    # malformed files raise and are caught by the caller's try/except
    g = tokenize(BytesIO(code.encode('utf-8')).readline)
    # drop empty tokens and newlines; [1:] skips the leading encoding token
    return [c[1] for c in g if c[1] != '' and c[1] != '\n'][1:]
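
For illustration, a quick check of what the tokenizer produces (the call and expected output are not part of the original gist):

print(tokenize_python("def f(x):\n    return len(x)"))
# ['def', 'f', '(', 'x', ')', ':', '    ', 'return', 'len', '(', 'x', ')']
# note that the indentation token survives the filter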
import os

def load_all_files(rootdir):
    # recursively collect the source of every .py file under rootdir
    files_code = []
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            if file.endswith('.py'):
                with open(os.path.join(subdir, file)) as pf:
                    try:
                        files_code.append(pf.read())
                    except UnicodeDecodeError:
                        pass  # skip files that cannot be decoded
    return files_code
islem-esi / text_sum.py
Created April 9, 2021 21:19
example for summarization models
import sumy
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
# LSA algorithm
from sumy.summarizers.lsa import LsaSummarizer

# text: text to summarize
# no_sentences: number of sentences in your summary
# lang: language of text
def lsa_summary(text, no_sentences, lang):
    parser = PlaintextParser.from_string(text, Tokenizer(lang))
    summarizer = LsaSummarizer()
    # returns the no_sentences most salient sentences
    return summarizer(parser.document, no_sentences)
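
A hypothetical call, to show the shape of the result; the summarizer returns sumy Sentence objects, printable as text:

# requires nltk 'punkt' data for the English tokenizer
text = "Sumy parses plain text into a document model. LSA then scores each sentence. The top sentences form the summary."
for sentence in lsa_summary(text, 2, 'english'):
    print(sentence)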
islem-esi / lstm_opseq.py
Last active February 16, 2021 22:56
lstm for opseq
#do a lot of imports
from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np
from os import listdir
import os.path
import json
from sklearn.preprocessing import OneHotEncoder
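
The gist stops at the imports; what follows is a minimal sketch of an LSTM over one-hot-encoded opcode sequences, with seq_len, n_opcodes, and the layer sizes as hypothetical choices, not from the original:

# Hypothetical dimensions: sequences of seq_len opcodes, n_opcodes distinct opcodes
seq_len, n_opcodes = 100, 64
model = Sequential()
model.add(LSTM(128, input_shape=(seq_len, n_opcodes)))  # 128 hidden units (arbitrary choice)
model.add(Dense(1, activation='sigmoid'))               # binary malware/benign output
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()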
islem-esi / assembly_groups.py
Created February 16, 2021 22:02
assembly instructions groups
inst_groups = {
    # Conditional Data Transfer
    'cdt': ['cmove', 'cmovz', 'cmovne', 'cmovnz', 'cmova', 'cmovnbe', 'cmovae', 'cmovnb',
            'cmovb', 'cmovnae', 'cmovbe', 'cmovna', 'cmovg', 'cmovnle', 'cmovge', 'cmovnl',
            'cmovl', 'cmovnge', 'cmovle', 'cmovng', 'cmovc', 'cmovnc', 'cmovo', 'cmovno',
            'cmovs', 'cmovns', 'cmovp', 'cmovpe', 'cmovnp', 'cmovpo'],
    # Unconditional Data Transfer
    'udt': ['mov', 'xchg', 'bswap', 'movsx', 'movzx', 'movlps', 'movdqa', 'lock xchg'],
    # Stack Data Transfer
    # ... (remaining groups truncated in the source)
}
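
A small helper, not in the gist, showing how such a grouping table is typically used to map a disassembled opcode back to its group:

def opcode_group(op):
    # return the group key ('cdt', 'udt', ...) of an instruction, or None if unknown
    return next((group for group, ops in inst_groups.items() if op in ops), None)

print(opcode_group('cmovz'))  # 'cdt'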
import numpy as np

# Change the parameters to whatever suits you
batch_size = 512
epochs = 100

# 0 = benign, 1 = malicious; model, benign_images and malicious_images come from earlier steps
labels = [0 for _ in benign_images] + [1 for _ in malicious_images]
model.fit(np.array(benign_images + malicious_images), np.array(labels),
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.25,
          shuffle=True)
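
The fragment trains on everything with a validation split; if a separate hold-out set exists (test_images and test_labels below are hypothetical, prepared the same way as the training data), evaluation would look like:

loss, acc = model.evaluate(np.array(test_images), np.array(test_labels))
print(f'hold-out accuracy: {acc:.3f}')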