Plegas Gerasimos makispl

## classify_spam_sms.py
# Classify a new message taken from advertising content (the snippet name
# suggests it is expected to come out as spam).
# The triple-quoted literal keeps the line break and the leading spaces of its
# second line as part of the message text passed to sms_classify.
sms_classify('''Hey, Sign up with this promo code and get your card for amazing
                exchange fees abroad and £5 to spend anywhere! Promocode: D48KV7BN''')

## classify_ham_sms.py
# Classify a new message taken from a private thread (the snippet name
# suggests it is expected to come out as ham, i.e. not spam).
# The triple-quoted literal keeps the line break and the leading spaces of its
# second line as part of the message text passed to sms_classify.
sms_classify('''Okey Stan! Seems to be a reasonable amount of money. I'll think
                of it and let you know ASAP.''')

## classify_test_set.py
# Classify every message of the test set: sms_classify_test_set (defined
# elsewhere) is applied to each value of the 'SMS' column and the result is
# stored in a new 'sms_predicted' column.
test_set['sms_predicted'] = test_set['SMS'].apply(sms_classify_test_set)

# Calculate the accuracy of the algorithm.
# `correct` starts at zero; it is presumably incremented by the comparison
# loop that follows (its body is not visible in this excerpt -- confirm).
correct = 0
# Number of rows in the test set.
total = test_set.shape[0]

for row in test_set.iterrows():
    row = row[1]
    if row['Label'] == row['sms_predicted']:

## custom_normalization.py
# Replace addresses (http, email), numbers (plain, phone) and money symbols
# with a single space.
#
# `regex=True` is passed explicitly on every call: since pandas 2.0 the default
# of `Series.str.replace` is `regex=False`, which would treat each pattern as a
# literal substring and silently leave the messages un-normalized.
# The order matters: e-mail addresses must go before the generic URL/domain
# pattern, and phone numbers before the plain-number pattern.
training_set['SMS'] = training_set['SMS'].str.replace(
    r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', ' ', regex=True)  # e-mail addresses
training_set['SMS'] = training_set['SMS'].str.replace(
    r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', ' ', regex=True)  # URLs / domains
training_set['SMS'] = training_set['SMS'].str.replace(
    r'£|\$', ' ', regex=True)  # money symbols
training_set['SMS'] = training_set['SMS'].str.replace(
    r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', ' ',
    regex=True)  # phone numbers
training_set['SMS'] = training_set['SMS'].str.replace(
    r'\d+(\.\d+)?', ' ', regex=True)  # remaining integers / decimals

# Remove punctuation, collapse all whitespace (spaces, line breaks, tabs) into
# a single space and strip leading/trailing whitespace. The final `.str.strip()`
# performs the trimming that the collapse step alone does not do.
training_set['SMS'] = training_set['SMS'].str.replace(r'[^\w\d\s]', ' ', regex=True)
training_set['SMS'] = training_set['SMS'].str.replace(r'\s+', ' ', regex=True).str.strip()

## lemmatization.py
# Lemmatize every whitespace-separated word of each SMS as a verb (pos='v')
# and join the results back into a single space-separated string.
lemmatizer = nltk.stem.WordNetLemmatizer()
training_set['SMS'] = training_set['SMS'].apply(
    lambda text: ' '.join(
        [lemmatizer.lemmatize(word, pos='v') for word in text.split()]
    )
)

## stemming.py
# Stem every whitespace-separated word of each SMS with the Porter stemmer
# and join the results back into a single space-separated string.
porter = nltk.PorterStemmer()
training_set['SMS'] = training_set['SMS'].apply(
    lambda text: ' '.join([porter.stem(word) for word in text.split()])
)

## tokenization.py
# Turn each SMS string into its list of tokens; the tokenizer is passed
# directly to `apply` instead of being wrapped in a lambda.
training_set['SMS'] = training_set['SMS'].apply(nltk.word_tokenize)

## vectorization.py
# Create the corpus: one flat list holding every token of every SMS.
# A nested comprehension is used instead of `Series.sum()`: summing a column of
# lists concatenates them pairwise (re-copying the growing result each time)
# and, for an empty Series, yields 0 instead of a list, which would make the
# `set(...)` call below fail.
corpus = [token for tokens in training_set['SMS'] for token in tokens]

# Create the vocabulary: the distinct tokens of the corpus.
temp_set = set(corpus)
vocabulary = list(temp_set)

# Create a dictionary mapping each vocabulary word to a list of zeros with one
# slot per training message. Each word gets its own independent list.
len_training_set = len(training_set['SMS'])
word_counts_per_sms = {unique_word: [0] * len_training_set for unique_word in vocabulary}

## classify_message.py
def sms_classify(message):
    '''
    Classify a new sms (w1, w2, ..., wn) as spam or ham.

    Intended behaviour: calculate P(Spam|w1, w2, ..., wn) and
    P(Ham|w1, w2, ..., wn), compare them and report whether the message is
    spam or not. Only the first normalization steps are visible in this
    excerpt; the probability computation is not shown here.
    '''

    # Replace addresses (http, email), numbers (plain, phone), money symbols
    # NOTE(review): the call sites in this file pass a plain `str`, and
    # `str.replace` searches for its first argument as literal text rather
    # than applying it as a regular expression, so these two statements do not
    # perform the intended substitution. `re.sub(pattern, ' ', message)` is
    # probably what was meant -- confirm and fix.
    message = message.replace(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', ' ')
    message = message.replace(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', ' ')

## ds-project-organization.md

      
        
          
            
              
              1 file
            
          
          
            
              
              0 forks
            
          
          
            
              
              0 comments
            
          
          
            
              
              0 stars
            
          
        
        
          
              
          
          
            
                makispl
                / ds-project-organization.md
            
            
              Created
              January 7, 2021 19:54
                — forked from ericmjl/ds-project-organization.md
            
              
                How to organize your Python data science project
              
          
        
      
        
  
      
    How to organize your Python data science project

Having done a number of data projects over the years, and having seen a number of them up on GitHub, I've come to see that there's a wide range in terms of how "readable" a project is. I'd like to share some practices that I have come to adopt in my projects, which I hope will bring some organization to your projects.
Disclaimer: I'm hoping nobody takes this to be "the definitive guide" to organizing a data project; rather, I hope you, the reader, find useful tips that you can adapt to your own projects.
Disclaimer 2: What I’m writing below is primarily geared towards Python language users. Some ideas may be transferable to other languages; others may not be so. Please feel free to remix whatever you see here!
Disclaimer 3: I found the Cookiecutter Data Science page after finishing this blog post. Many ideas overlap here, though some directories are irrelevant in my work -- which is to
	# Classify a new message taken from advertising content.
	# The triple-quoted literal keeps the line break and the leading tab of its
	# second line as part of the message text passed to sms_classify.
	sms_classify('''Hey, Sign up with this promo code and get your card for amazing
	exchange fees abroad and £5 to spend anywhere! Promocode: D48KV7BN''')
	# Classify a new message taken from a private thread.
	# The triple-quoted literal keeps the line break and the leading tab of its
	# second line as part of the message text passed to sms_classify.
	sms_classify('''Okey Stan! Seems to be a reasonable amount of money. I'll think
	of it and let you know ASAP.''')
	# Classify every message of the test set: sms_classify_test_set (defined
	# elsewhere) is applied to each value of the 'SMS' column and the result is
	# stored in a new 'sms_predicted' column.
	test_set['sms_predicted'] = test_set['SMS'].apply(sms_classify_test_set)

	# Calculate the accuracy of the algorithm.
	# `correct` starts at zero; it is presumably incremented by the comparison
	# loop that follows (its body is not visible in this excerpt -- confirm).
	correct = 0
	# Number of rows in the test set.
	total = test_set.shape[0]

	for row in test_set.iterrows():
	row = row[1]
	if row['Label'] == row['sms_predicted']:
	# Replace addresses (http, email), numbers (plain, phone) and money symbols
	# with a single space.
	#
	# Two fixes relative to the previous text of this snippet:
	#   * the alternations use a plain `|`; the earlier `\|` was a Markdown
	#     escape artefact that made the patterns look for a literal pipe;
	#   * `regex=True` is passed explicitly, because since pandas 2.0 the default
	#     of `Series.str.replace` is `regex=False` (literal substring match).
	# The order matters: e-mail addresses must go before the generic URL/domain
	# pattern, and phone numbers before the plain-number pattern.
	training_set['SMS'] = training_set['SMS'].str.replace(
	    r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', ' ', regex=True)  # e-mail addresses
	training_set['SMS'] = training_set['SMS'].str.replace(
	    r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', ' ', regex=True)  # URLs / domains
	training_set['SMS'] = training_set['SMS'].str.replace(
	    r'£|\$', ' ', regex=True)  # money symbols
	training_set['SMS'] = training_set['SMS'].str.replace(
	    r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', ' ',
	    regex=True)  # phone numbers
	training_set['SMS'] = training_set['SMS'].str.replace(
	    r'\d+(\.\d+)?', ' ', regex=True)  # remaining integers / decimals

	# Remove punctuation, collapse all whitespace (spaces, line breaks, tabs) into
	# a single space and strip leading/trailing whitespace. The final
	# `.str.strip()` performs the trimming that the collapse step alone does not do.
	training_set['SMS'] = training_set['SMS'].str.replace(r'[^\w\d\s]', ' ', regex=True)
	training_set['SMS'] = training_set['SMS'].str.replace(r'\s+', ' ', regex=True).str.strip()
	# Lemmatize every whitespace-separated word of each SMS as a verb (pos='v')
	# and join the results back into a single space-separated string.
	lemmatizer = nltk.stem.WordNetLemmatizer()
	training_set['SMS'] = training_set['SMS'].apply(
	    lambda text: ' '.join(
	        [lemmatizer.lemmatize(word, pos='v') for word in text.split()]
	    )
	)
	# Stem every whitespace-separated word of each SMS with the Porter stemmer
	# and join the results back into a single space-separated string.
	porter = nltk.PorterStemmer()
	training_set['SMS'] = training_set['SMS'].apply(
	    lambda text: ' '.join([porter.stem(word) for word in text.split()])
	)
	# Create the corpus: one flat list holding every token of every SMS.
	# A nested comprehension is used instead of `Series.sum()`: summing a column
	# of lists concatenates them pairwise (re-copying the growing result each
	# time) and, for an empty Series, yields 0 instead of a list, which would
	# make the `set(...)` call below fail.
	corpus = [token for tokens in training_set['SMS'] for token in tokens]

	# Create the vocabulary: the distinct tokens of the corpus.
	temp_set = set(corpus)
	vocabulary = list(temp_set)

	# Create a dictionary mapping each vocabulary word to a list of zeros with
	# one slot per training message. Each word gets its own independent list.
	len_training_set = len(training_set['SMS'])
	word_counts_per_sms = {unique_word: [0] * len_training_set for unique_word in vocabulary}
	# NOTE(review): this copy of sms_classify has lost its indentation (the body
	# sits at the same level as the `def` line) and is cut off after the first
	# two normalization statements, so it is not runnable as shown.
	def sms_classify(message):
	'''
	Takes in as input a new sms (w1, w2, ..., wn),
	calculates P(Spam\|w1, w2, ..., wn) and P(Ham\|w1, w2, ..., wn),
	compares them and outcomes whether the message is spam or not.
	'''

	# Replace addresses (http, email), numbers (plain, phone), money symbols
	# NOTE(review): `str.replace` searches for its first argument as literal
	# text rather than applying it as a regular expression, so these statements
	# do not perform the intended substitution; `re.sub` is probably what was
	# meant -- confirm. The `\|` in the second pattern (and in the docstring)
	# looks like a Markdown escape of `|` rather than an intended literal pipe.
	message = message.replace(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', ' ')
	message = message.replace(r'(http[s]?\S+)\|(\w+\.[A-Za-z]{2,4}\S*)', ' ')