Zoumana Keita keitazoumana

## install tweepy
pip install tweepy

## import modules
import tweepy
import pandas as pd
import nltk
from tweepy import OAuthHandler
from tweepy import Cursor

## preprocess_data.py
"""
Content of preprocess_data.py
"""
from sklearn.model_selection import train_test_split
import pandas as pd

def prepare_data(path_to_data):
    """Load the CSV file found at ``path_to_data`` with pandas.

    The parsed table is only bound to a local name; as written, the
    function hands nothing back to its caller.

    NOTE(review): this looks like an excerpt of a longer function --
    confirm whether further processing / a return statement is missing.
    """
    # Parse the CSV at the given location
    frame = pd.read_csv(path_to_data)

## train_model.py
"""
Content of train_model.py
"""
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pickle

def run_model_training(X_train, X_test, y_train, y_test):
    """Fit a default-configured ``LogisticRegression`` on the training split.

    ``X_test`` and ``y_test`` are accepted but not used by the code shown
    here, and the fitted classifier is not returned.
    """
    model = LogisticRegression()
    model.fit(X_train, y_train)

## run_training.py
"""
Content of run_training.py
"""

import joblib

# Get customized functions from library
from packages.preprocess_data import *
from packages.train_model import run_model_training

## with_slate.py
import slate3k as slate
# Open the PDF inside a context manager so the file handle is closed as soon
# as the text has been extracted (a bare open() call would leave it open).
with open('./data/obama-worlds-matter.pdf', 'rb') as pdf_file:
    text = slate.PDF(pdf_file).text()
print(text)

## with_pdfminer_six.py
from pdfminer.high_level import extract_text
# extract_text() takes the path (or an open binary file) as its first
# argument; its second positional parameter is the PDF password, so a file
# mode such as 'rb' must not be passed there.
text = extract_text('./data/obama-worlds-matter.pdf')
print(text)

## with_pyPDF.py
from PyPDF2 import PdfFileReader

# Open the PDF in binary mode. The handle is not closed anywhere in the
# code shown here.
pdfObject = open('./data/obama-worlds-matter.pdf', 'rb')

# Wrap the open file in a PyPDF2 reader object.
# NOTE(review): PdfFileReader is the legacy PyPDF2 class name -- confirm the
# installed PyPDF2 version still provides it.
pdfReader = PdfFileReader(pdfObject)

# Accumulator for the concatenated page contents; the code that fills it is
# not shown in this snippet.
text=''

## with_tabulate.py
from tabula import read_pdf
from tabulate import tabulate
import pandas as pd
import io

# Read only page n°6 of the file
# Restrict extraction to page 6, with the multiple-tables and stream options on
food_calories = read_pdf(
    './data/food_calories.pdf',
    pages=6,
    multiple_tables=True,
    stream=True,
)

# Transform the result into a string table format

## with_tabulate_v2.py
import tabula

# Convert page 6 of the PDF directly into a CSV file on disk
tabula.convert_into(
    './data/food_calories.pdf',
    "./data/food_calories2.csv",
    output_format="csv",
    pages=6,
)
	import tweepy
	import pandas as pd
	import nltk
	from tweepy import OAuthHandler
	from tweepy import Cursor
	"""
	Content of preprocess_data.py
	"""
	from sklearn.model_selection import train_test_split
	import pandas as pd

	def prepare_data(path_to_data):
		"""Load the CSV file found at ``path_to_data`` with pandas.

		The body is indented under the ``def`` again; at the same indent as
		the header, the function had no body and could not be parsed.

		NOTE(review): the loaded data is not returned by the code shown
		here -- this looks like an excerpt; confirm the remainder.
		"""
		# Read data from path
		data = pd.read_csv(path_to_data)
	"""
	Content of train_model.py
	"""
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import classification_report
	import pickle

	def run_model_training(X_train, X_test, y_train, y_test):
		"""Fit a default-configured ``LogisticRegression`` on the training split.

		The body is indented under the ``def`` again; at the same indent as
		the header, the function had no body and could not be parsed.
		``X_test`` and ``y_test`` are accepted but not used by the code
		shown here, and the fitted classifier is not returned.
		"""
		clf = LogisticRegression()
		clf.fit(X_train, y_train)
	"""
	Content of run_training.py
	"""

	import joblib

	# Get customized functions from library
	from packages.preprocess_data import *
	from packages.train_model import run_model_training
	import slate3k as slate
	# Open the PDF inside a context manager so the file handle is closed as
	# soon as the text has been extracted (a bare open() would leave it open).
	with open('./data/obama-worlds-matter.pdf', 'rb') as pdf_file:
		text = slate.PDF(pdf_file).text()
	print(text)
	from pdfminer.high_level import extract_text
	# extract_text() takes the path (or an open binary file) as its first
	# argument; its second positional parameter is the PDF password, so a
	# file mode such as 'rb' must not be passed there.
	text = extract_text('./data/obama-worlds-matter.pdf')
	print(text)
	from PyPDF2 import PdfFileReader

	# Open the PDF in binary mode. The handle is not closed anywhere in the
	# code shown here.
	pdfObject = open('./data/obama-worlds-matter.pdf', 'rb')

	# Wrap the open file in a PyPDF2 reader object.
	# NOTE(review): PdfFileReader is the legacy PyPDF2 class name -- confirm
	# the installed PyPDF2 version still provides it.
	pdfReader = PdfFileReader(pdfObject)

	# Accumulator for the concatenated page contents; the code that fills it
	# is not shown in this snippet.
	text=''
	from tabula import read_pdf
	from tabulate import tabulate
	import pandas as pd
	import io

	# Read only page n°6 of the file
	# Restrict extraction to page 6, with the multiple-tables and stream options on
	food_calories = read_pdf(
		'./data/food_calories.pdf',
		pages=6,
		multiple_tables=True,
		stream=True,
	)

	# Transform the result into a string table format
	import tabula

	# Convert page 6 of the PDF directly into a CSV file on disk
	tabula.convert_into(
		'./data/food_calories.pdf',
		"./data/food_calories2.csv",
		output_format="csv",
		pages=6,
	)