def extract_keywords(text):
    """
    Extract keywords and construct them back from tokens
    """
    result = list()
    keyword = ""
    for token in nlp(text):
        if token['entity'] == 'B-KEY':
            # A new keyword begins; store the previous one if we have it
            if keyword:
                result.append(keyword)
            keyword = token['word']
        elif token['entity'] == 'I-KEY':
            # Continuation token: glue sub-word pieces ("##...") without a space
            keyword += token['word'][2:] if \
                token['word'].startswith("##") else f" {token['word']}"
    if keyword:
        result.append(keyword)
    return result
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

tokenizer = AutoTokenizer.from_pretrained("yanekyuk/bert-uncased-keyword-extractor")
model = AutoModelForTokenClassification.from_pretrained(
    "yanekyuk/bert-uncased-keyword-extractor"
)
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
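A quick sanity check of the pipeline is to run the extractor on a short sentence; the example input below is illustrative and not part of the original snippets:

# Illustrative input; the printed keywords depend on the model's predictions
sample = "Graph data science combines machine learning with network analysis."
print(extract_keywords(sample))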
last_name count
Frey 91
Targaryen 66
Stark 51
Lannister 30
Hightower 28
Velaryon 21
Baratheon 21
Greyjoy 19
Rivers 15
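Surname counts like these typically come from a simple grouping query over character nodes; a hypothetical sketch using the graphdatascience client's run_cypher helper (the Character label and last_name property are assumptions, not confirmed by the snippets here):

# Hypothetical query; node label and property name are assumed
surname_counts = gds.run_cypher("""
    MATCH (c:Character)
    WHERE c.last_name IS NOT NULL
    RETURN c.last_name AS last_name, count(*) AS count
    ORDER BY count DESC
""")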
componentId componentSize
5 785
457 19
111 12
938 11
193 10
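Component sizes like these are usually obtained by streaming weakly connected components and grouping by component id; a minimal sketch assuming `gds` is a GraphDataScience client and `G` a projected graph (both names are assumptions):

# Sketch: stream WCC results and count nodes per component
wcc_df = gds.wcc.stream(G)
component_sizes = (
    wcc_df.groupby("componentId")
    .size()
    .to_frame("componentSize")
    .reset_index()
    .sort_values("componentSize", ascending=False)
)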
Semicolon-delimited CSV sample (no header row):
0;23.846;0;0
23.846;0;23.846;0
0;23.846;0;23.846
0;0;23.846;0
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Standardize the node features before projecting them to 2-D with t-SNE
tsne_data = merged_df.drop(["nodeId", "communityId"], axis=1).values.tolist()
scaler = StandardScaler()
scaler.fit(tsne_data)
tsne_data = scaler.transform(tsne_data)

tsne = TSNE(
    n_components=2, n_iter=500, random_state=42, perplexity=50, learning_rate=20
)
tsne_values = tsne.fit_transform(tsne_data)
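One way to inspect the embedding is to scatter-plot the two t-SNE components colored by community; the plotting code below is an illustration only (it assumes matplotlib/seaborn are available and uses the tsne_values array produced above):

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Attach the 2-D embedding to the community labels for plotting
tsne_df = pd.DataFrame(tsne_values, columns=["x", "y"])
tsne_df["communityId"] = merged_df["communityId"].values
sns.scatterplot(data=tsne_df, x="x", y="y", hue="communityId", palette="tab10", s=15)
plt.show()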
import seaborn as sns

# Inspect the feature distributions of nodes assigned to community 4
community_4 = (
    merged_df[merged_df["communityId"] == 4]
    .drop("communityId", axis=1)
    .melt(id_vars="nodeId")
)
sns.catplot(
    data=community_4, y="value", col="variable", col_wrap=3, kind="boxen", sharey=False
)
# Compare the weighted out-degree distribution across communities
sns.catplot(
    col="communityId",
    y="weightedOutdegree",
    height=6,
    data=merged_df,
    kind="boxen",
    col_wrap=3,
)
# Combine the node features with the KMeans community assignments
merged_df = pivot_features_df.merge(kmeans_df, on="nodeId")
# Count how many nodes ended up in each community
merged_df.drop("nodeId", axis=1).groupby("communityId").size().to_frame(
    "communitySize"
).reset_index()
# Stream KMeans community assignments (k=6) computed on the node feature vectors
kmeans_df = gds.alpha.kmeans.stream(
    largestComponentGraph, nodeProperty="features", k=6, randomSeed=42
)