Skip to content

Instantly share code, notes, and snippets.

View astoeckl's full-sized avatar

Andreas Stöckl astoeckl

  • FH OÖ
  • Hagenberg
View GitHub Profile
import pandas as pd
from transformers import GPT2TokenizerFast
from sklearn.datasets import fetch_20newsgroups
# Load the training split of the 20 Newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset="train")

# Sanity check: show the category names, then the first three documents
# and the first three target labels.
print([*newsgroups_train.target_names])
print(newsgroups_train.data[0:3])
print(newsgroups_train.target[0:3])

# One row per document; the single "Text" column holds the raw document.
df_news = pd.DataFrame({"Text": list(newsgroups_train.data)})
import openai
# Authenticate against the OpenAI API. The value below is only a placeholder:
# replace it with your own secret key, and never commit a real key to a
# public gist or repository.
openai.api_key = "XXX-YOURKEY"
# Natural-language instruction passed to the completion call as its prompt.
# Plain ASCII quotes are required; typographic quotes are a syntax error.
prompt = "Here the command in natural language is formulated"
response = openai.Completion.create(
engine="davinci-codex",
prompt= prompt,
temperature=0,
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
# Wrap the raw newsgroup documents in a single-column dataframe.
df = pd.DataFrame({"text": fetch_20newsgroups().data})
# Preview the first 10 rows (a notebook renders this only when it is the
# final expression of the cell; in a plain script the result is discarded).
df.head(n=10)
"""
import gensim
from gensim import corpora, models
# Split every document on whitespace, giving one token list per document.
tokenized_docs = [text.split() for text in df["text"]]
# Build the gensim vocabulary (token <-> integer id mapping) from those lists.
dictionary = corpora.Dictionary(tokenized_docs)
# Create a corpus from the tokenized documents
import pandas as pd
import warnings
# Suppress all warnings from here on.
warnings.filterwarnings(action="ignore")

# Both input files are semicolon-separated and read as UTF-8.
dfmai = pd.read_csv("artikel.csv", sep=";", encoding="utf-8")
orte = pd.read_csv("ortsliste.csv", sep=";", encoding="utf-8")

# Take two of the auto-named columns, skipping the first three and the last
# two rows. NOTE(review): presumably place names and postal codes, judging by
# the variable names — confirm against the layout of ortsliste.csv.
ortliste = list(orte["Unnamed: 3"][3:-2])
plzliste = list(orte["Unnamed: 4"][3:-2])

# Keep only the article columns used downstream.
dforte = dfmai[
    [
        "title",
        "body",
        "published_at",
        "domain",
        "words_count",
    ]
]
@astoeckl
astoeckl / leadprediction
Last active November 13, 2021 13:30
leadprediction
# Importing libraries
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
#Import data
import numpy as np
import matplotlib.pyplot as plt
# Make every subsequently created figure 20 x 10 inches by default.
plt.rcParams["figure.figsize"] = (20,10)
from sklearn.manifold import TSNE
def tsnescatterplot(model, word_labels):
arr = np.empty((0,100), dtype='f')
# Query the model's word vectors for the entries most similar to the token 'd4'.
# The result is not stored; it is only visible as notebook cell output.
model.wv.similar_by_word('d4')
%%time
import gensim
from gensim.models import Word2Vec
# Train word vectors of dimension 100 with a context window of 3, using
# 4 worker threads.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.0 renamed it to
# `vector_size` — confirm which gensim version this snippet targets.
model = Word2Vec(
    sentences=df,
    size=100,
    window=3,
    workers=4,
)
# Persist the trained model to disk so it can be reloaded later.
model.save("word2vec.model")