This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from transformers import GPT2TokenizerFast

# Load the training split of the 20 Newsgroups dataset via scikit-learn.
newsgroups_train = fetch_20newsgroups(subset='train')

# Quick inspection: the label names, then the first three raw posts and
# their target values.
print(list(newsgroups_train.target_names))
print(newsgroups_train.data[:3])
print(newsgroups_train.target[:3])

# One row per post, with the raw text in a single 'Text' column.
df_news = pd.DataFrame(list(newsgroups_train.data), columns=['Text'])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import openai

# Placeholder only: substitute your own OpenAI API key here. Do not commit a
# real key to source control (loading it from the environment is safer).
openai.api_key = "XXX-YOURKEY"

# Natural-language instruction used as the completion prompt.
prompt = "Here the command in natural language is formulated"
response = openai.Completion.create( | |
engine="davinci-codex", | |
prompt= prompt, | |
temperature=0, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

# Load the dataset into a dataframe with a single 'text' column.
df = pd.DataFrame(fetch_20newsgroups().data, columns=['text'])

# Show the first 10 rows. As a bare expression this only renders output when
# it is the last statement of a notebook cell; it prints nothing in a script.
df.head(10)
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gensim
from gensim import corpora, models

# Tokenize the documents by splitting each text on whitespace.
# `df` is not defined in this snippet; it must already exist and expose a
# 'text' column of strings.
tokenized_docs = [doc.split() for doc in df['text']]

# Create a dictionary from the tokenized documents.
dictionary = corpora.Dictionary(tokenized_docs)
# Create a corpus from the tokenized documents |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import warnings

import pandas as pd

# Suppress all warnings from this point on.
warnings.filterwarnings('ignore')

# Both inputs are read as semicolon-separated, UTF-8 encoded CSV files.
dfmai = pd.read_csv("artikel.csv", sep=";", encoding='utf-8')
orte = pd.read_csv("ortsliste.csv", sep=";", encoding='utf-8')

# Slice [3:-2] skips the first three and the last two entries of each column.
# NOTE(review): presumably these columns hold place names and postal codes
# (going by the variable names) and the skipped rows are header/footer
# lines of the sheet -- confirm against ortsliste.csv.
ortliste = list(orte["Unnamed: 3"][3:-2])
plzliste = list(orte["Unnamed: 4"][3:-2])

# Restrict the article table to a fixed subset of its columns.
dforte = dfmai[["title", "body", "published_at", "domain", "words_count"]]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Importing libraries | |
import pandas as pd | |
import statsmodels.api as sm | |
from sklearn.model_selection import train_test_split | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.feature_selection import RFE | |
#Import data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE

# Default (width, height) applied to figures created after this point.
plt.rcParams["figure.figsize"] = (20, 10)
def tsnescatterplot(model, word_labels): | |
arr = np.empty((0,100), dtype='f') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Query the model's word vectors for the entries most similar to the token 'd4'.
# `model` is not defined in this snippet; it must already exist.
model.wv.similar_by_word('d4')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Look up the vector stored for the token 'd4'.
# `model` is not defined in this snippet; it must already exist.
model.wv['d4']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%%time | |
import gensim | |
from gensim.models import Word2Vec | |
model = Word2Vec(sentences=df, size=100, window=3, workers=4) | |
model.save("word2vec.model") |