Thiago Coelho Vieira tcvieira

## missing.py
# Two ways to fill missing values in the "height" column.
# Option 1: substitute a hand-picked constant for every NaN.
avg_height = 67  # rough guess at a typical height
data["height"] = data["height"].fillna(value=avg_height)

# Option 2: substitute a statistic computed from the column itself.
# NOTE: when run right after option 1 there are no NaNs left, so the median
# is taken over the already-filled column and this fill changes nothing.
avg_height = data["height"].median()  # likely a better estimate than a guess
data["height"] = data["height"].fillna(value=avg_height)

# Dropping rows with missing values
# Here we check which rows of "height" aren't null

## format.py
# Formatting data
# Upper-case every state string first, then collapse the known California
# spellings into one canonical value.
data['state'] = (
    data['state']
    .str.upper()
    .replace(to_replace=["CA", "C.A", "CALI"], value="CALIFORNIA")
)

# Dates and times are quite common in large datasets
# Converting all strings to datetime objects is good standardisation practice
# Here, the data["time"] strings will look like "2019-01-15", which is exactly
# how we set the "format" variable below

## setupFastaiV1.md

      
              1 file
            
          
              1 fork
            
          
              6 comments
            
          
              6 stars
            
          
                tcvieira
                / setupFastaiV1.md
            
            
              Last active
              October 12, 2021 13:10
            
              
                Setup Fast.ai v1 on Paperspace Fast.ai Template
              
          
    Setup Fastai v1 on Paperspace

Machine


Create a Fast.ai machine from public templates w/ P4000 and public IP

Connect to the machine


$ source deactivate fastai
$ pip install virtualenv


## speed_up.py
import numpy as np
import multiprocessing as multi

def chunks(n, page_list):
    """Divide ``page_list`` into ``n`` consecutive pieces.

    The work is delegated to ``np.array_split``, so the result is a list of
    ``n`` NumPy arrays. When the length of ``page_list`` is not a multiple
    of ``n`` the pieces are not all the same size.
    """
    pieces = np.array_split(page_list, n)
    return pieces

cpus = multi.cpu_count()
workers = []
page_list = ['www.website.com/page1.html', 'www.website.com/page2.html'

## min-char-rnn.py
"""
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
BSD License
"""
import numpy as np

# data I/O
# Read the whole corpus into memory; the context manager closes the file
# handle as soon as the read finishes instead of leaving it to the GC.
with open('input.txt', 'r') as corpus_file:
    data = corpus_file.read()  # should be simple plain text file
# Distinct characters in the corpus. Set iteration order is arbitrary, so the
# order of `chars` is not guaranteed to be the same from run to run.
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
	# Filling in NaN values of a particular feature variable
	avg_height = 67 # Maybe this is a good number
	data["height"] = data["height"].fillna(avg_height)

	# Filling in NaN values with a calculated one
	avg_height = data["height"].median() # This is probably more accurate
	data["height"] = data["height"].fillna(avg_height)

	# Dropping rows with missing values
	# Here we check which rows of "height" aren't null
	# Formatting data
	data['state'] = data['state'].str.upper() # Capitalize the whole thing
	data['state'] = data['state'].replace( # Changing the format of the string
	to_replace=["CA", "C.A", "CALI"],
	value=["CALIFORNIA", "CALIFORNIA", "CALIFORNIA"])

	# Dates and times are quite common in large datasets
	# Converting all strings to datetime objects is good standardisation practice
	# Here, the data["time"] strings will look like "2019-01-15", which is exactly
	# how we set the "format" variable below
	import numpy as np
	import multiprocessing as multi

	def chunks(n, page_list):
	"""Splits the list into n chunks"""
	return np.array_split(page_list,n)

	cpus = multi.cpu_count()
	workers = []
	page_list = ['www.website.com/page1.html', 'www.website.com/page2.html'
	"""
	Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
	BSD License
	"""
	import numpy as np

	# data I/O
	data = open('input.txt', 'r').read() # should be simple plain text file
	chars = list(set(data))
	data_size, vocab_size = len(data), len(chars)