Abdurrahman Beyaz alabrashJr

@alabrashJr
alabrashJr / load_bin_vec
Created March 21, 2019 13:53
Load pre-trained word-embedding vectors from a word2vec-style binary file such as google_w2v.bin.
# fname: the file name of the binary file <google_w2v.bin>
# vocab: vocabulary dictionary; only words present in vocab are kept
function load_bin_vec(fname, vocab)
    word_vecs = Dict{String,Vector{Float32}}()
    open(fname, "r") do f
        # header line: "<vocab_size> <layer1_size>"
        vocab_size, layer1_size = parse.(Int, split(readline(f)))
        binary_len = sizeof(Float32) * layer1_size
        for _ in 1:vocab_size
            word = strip(readuntil(f, ' '))   # each entry: word, space, raw Float32 bytes
            vec = collect(reinterpret(Float32, read(f, binary_len)))
            haskey(vocab, word) && (word_vecs[word] = vec)
        end
        word_vecs
    end
end
# (preview of a Jupyter-notebook gist; only the first code cell is recoverable)
import requests
from bs4 import BeautifulSoup
@alabrashJr
alabrashJr / clean_timesofindia.py
Created September 3, 2019 13:27
A script to clean and parse Times of India news HTML files into plain-text files.
# this script was written by Cigil and edited by aalabrash18@ku.edu.tr
import pandas as pd
from lxml import etree
import re
import os, sys, io, traceback, codecs
from bs4 import BeautifulSoup, Comment
UNESCAPE = True
import html as h
import glob
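
The gist preview stops at the imports. Below is a minimal sketch of the cleaning pass the description implies, reusing the imports above; the file layout and helper name are hypothetical, not from the original script:

def clean_html_file(path):
    # parse a saved article page
    with open(path, encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "lxml")
    # drop scripts, styles, and HTML comments
    for tag in soup(["script", "style"]):
        tag.decompose()
    for c in soup.find_all(string=lambda s: isinstance(s, Comment)):
        c.extract()
    # flatten to text; the extra unescape pass mirrors the gist's UNESCAPE flag
    return h.unescape(soup.get_text(separator=" ", strip=True))

for path in glob.glob("timesofindia/*.html"):  # hypothetical layout: one file per article
    with open(path.replace(".html", ".txt"), "w", encoding="utf-8") as out:
        out.write(clean_html_file(path))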

Multi-Label Text Classification

Classification Methods

https://unsplash.com/photos/VISPUxoCwx4

Is it a dog?    -> Yes/No                       -> binary classification
Which label?    -> dog / nature / grass (one)   -> multi-class classification
Which labels?   -> dog, nature, grass (several) -> multi-label classification
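
To make the distinction concrete, here is a minimal sketch (mine, not from the original post) of how the three target formats look for scikit-learn, reusing the dog/nature/grass example above:

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

# binary: one yes/no answer per sample ("is it a dog?")
y_binary = np.array([1, 0, 1])

# multi-class: exactly one label per sample
y_multiclass = np.array(["dog", "nature", "grass"])

# multi-label: any subset of labels, encoded as an indicator matrix
mlb = MultiLabelBinarizer(classes=["dog", "nature", "grass"])
y_multilabel = mlb.fit_transform([("dog", "grass"), ("nature",), ("dog", "nature", "grass")])
print(y_multilabel)
# [[1 0 1]
#  [0 1 0]
#  [1 1 1]]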
def predict(text):
    print("text -> ", text)
    # stem each token with the same stemmer used at training time,
    # then re-join: the vectorizer expects raw strings, not token lists
    text_s = " ".join(stemmer.stem(w) for w in text.split())
    print("text stemmed -> ", text_s)
    X = vectorizer.transform([text_s])
    X_pred = lp_classifier.predict(X).toarray()
    # indicator columns line up with the `labels` list used for y_train
    return [labels[i] for i, x in enumerate(X_pred[0]) if x == 1]

# Turkish sample, roughly: "the last time I took my car in for service it
# wasn't repaired properly; not satisfied"
text = "araçım servise son getirmediğimde düzgün tamir edilmedi memnun değil"
!pip install scikit-multilearn
!pip install scikit-learn
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
from sklearn.metrics import f1_score, accuracy_score

# Label Powerset: treat each distinct label combination as one atomic class
lp_classifier = LabelPowerset(LogisticRegression())
lp_classifier.fit(X_train, y_train)
lp_predictions = lp_classifier.predict(x_test)
print("Accuracy = ", accuracy_score(y_test, lp_predictions))
print("F1 score = ", f1_score(y_test, lp_predictions, average="micro"))
# label columns: Price, Feedback, Usage, Attitude & Behaviour, Quality
labels = ["Fiyat", "Geri Bildirim", "Kullanım", "Tavır ve davranış", "Kalite"]
y_train=train[labels]
y_test=test[labels]
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word',
                             ngram_range=(1, 3), norm='l2', max_features=10000)
# fit on the training text only; refitting on test_text would overwrite the
# vocabulary and leak information from the test set
vectorizer.fit(train_text)
X_train = vectorizer.transform(train_text)
x_test = vectorizer.transform(test_text)
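
A quick sanity check on the fitted vectorizer (exact numbers depend on the data):

print(len(vectorizer.vocabulary_))  # at most 10000, the max_features cap
print(X_train.shape, x_test.shape)  # sparse (n_documents, n_features) matrices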
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, random_state=42, test_size=0.30, shuffle=True)
train_text = train['TurkishStemmer'].values.astype('U')
test_text = test['TurkishStemmer'].values.astype('U')
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
#!pip install TurkishStemmer
from TurkishStemmer import TurkishStemmer
stemmer = TurkishStemmer()
tokenizer = RegexpTokenizer(r'\w+')
# lowercase and drop punctuation by re-tokenizing on word characters
punct_re = lambda x: " ".join(tokenizer.tokenize(x.lower()))

def stemmer_char(text, i):
    # crude "stemming": keep only the first i characters of each token
    return " ".join(word[:i] for word in word_tokenize(text))