Last active
November 21, 2017 10:22
-
-
Save Audhil/e09dd2eb48f99d62cc7f675169901dac to your computer and use it in GitHub Desktop.
Gist to copy data from csv to .txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
It reads rows from a CSV file and separates the "labels" and "texts" columns from it.
""" | |
import pandas as pd | |
import os | |
import numpy as np | |
import string | |
# Characters removed from every message: ASCII punctuation and decimal digits.
_UNWANTED_CHARS = frozenset(string.punctuation + '0123456789')


def _clean_text(raw):
    """Lowercase one message, drop punctuation and digits, collapse whitespace."""
    # str() instead of np.array_str(): array_str returns the array's repr, so a
    # newline inside a message became the two characters "\n"; the backslash
    # was then stripped as punctuation and a stray "n" glued two words together.
    lowered = str(raw).lower()
    kept = ''.join(c for c in lowered if c not in _UNWANTED_CHARS)
    # split()/join collapses every whitespace run (including embedded newlines)
    # into a single space, so each message stays on one output line.
    return ' '.join(kept.split())


def run():
    """
    Split ``temp_spam_data.csv`` into a labels file and a cleaned-texts file.

    The CSV is read from the current working directory and must provide the
    columns ``label`` and ``text``. Two files are written below the current
    working directory (their folders are created when missing):

    * ``msg_labels/msg_labels_file.txt`` - one float per row, ``1.`` when the
      label equals ``'spam'`` and ``0.`` for anything else;
    * ``msg_texts/msg_texts_file.txt`` - one message per row, lowercased, with
      punctuation and digits removed and whitespace collapsed.

    When the CSV is missing a notice is printed and nothing is written.

    :return: None
    """
    base_dir = os.getcwd()
    csv_path = os.path.join(base_dir, 'temp_spam_data.csv')
    msg_label_dir = os.path.join(base_dir, 'msg_labels')
    msg_label_file = os.path.join(msg_label_dir, 'msg_labels_file.txt')
    msg_texts_dir = os.path.join(base_dir, 'msg_texts')
    msg_texts_file = os.path.join(msg_texts_dir, 'msg_texts_file.txt')

    if not os.path.isfile(csv_path):
        print('---file not found!')
        return None

    # Read the very path that was just checked, not a second relative spelling.
    data = pd.read_csv(csv_path, sep=",")

    # turning them to 0. & 1.
    labels = [1. if label == 'spam' else 0. for label in data['label']]

    # saving labels
    if not os.path.isdir(msg_label_dir):
        os.mkdir(msg_label_dir)
    np.savetxt(msg_label_file, labels, fmt='%f')
    print('---saving labels done!')

    # saving texts
    texts = [_clean_text(text) for text in data['text']]
    if not os.path.isdir(msg_texts_dir):
        os.mkdir(msg_texts_dir)
    np.savetxt(msg_texts_file, texts, delimiter=' ', fmt='%s')
    print('---saving texts done!')
    print('---all done!')
# Script entry point: do the split only when executed directly, not on import.
if __name__ == "__main__":
    run()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
0.000000 | |
0.000000 | |
1.000000 | |
0.000000 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat | |
ok lar joking wif u oni | |
free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry questionstd txt ratetcs apply overs | |
u dun say so early hor u c already then say |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
label | text | |
---|---|---|
ham | Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat... | |
ham | Ok lar... Joking wif u oni... | |
spam | Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's | |
ham | U dun say so early hor... U c already then say... |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment