Skip to content

Instantly share code, notes, and snippets.

@keitazoumana
keitazoumana / install tweepy
Last active November 22, 2021 19:51
Install module
pip install tweepy
@keitazoumana
keitazoumana / import modules
Created November 22, 2021 19:56
Import modules
import tweepy
import pandas as pd
import nltk
from tweepy import OAuthHandler
from tweepy import Cursor
"""
Content of preprocess_data.py
"""
from sklearn.model_selection import train_test_split
import pandas as pd
def prepare_data(path_to_data):
    """Load the CSV dataset found at ``path_to_data``.

    Args:
        path_to_data: Location of the CSV file, forwarded unchanged to
            ``pandas.read_csv``.

    NOTE(review): only the opening lines of this function are visible in
    this excerpt. The body is indented here so the definition parses; the
    remaining steps (presumably a train/test split and a return value)
    must be restored from the full preprocess_data.py.
    """
    # Read data from path
    data = pd.read_csv(path_to_data)
"""
Content of train_model.py
"""
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pickle
def run_model_training(X_train, X_test, y_train, y_test):
    """Fit a ``LogisticRegression`` classifier on the training split.

    Args:
        X_train: Training features handed to ``clf.fit``.
        X_test: Held-out features; not used by the visible lines.
        y_train: Training targets handed to ``clf.fit``.
        y_test: Held-out targets; not used by the visible lines.

    NOTE(review): only the opening lines of this function are visible in
    this excerpt. The body is indented here so the definition parses; any
    evaluation, persistence or return of the model must be restored from
    the full train_model.py.
    """
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
"""
Content of run_training.py
"""
import joblib
# Get customized functions from library
from packages.preprocess_data import *
from packages.train_model import run_model_training
import slate3k as slate
# Parse the PDF inside a context manager so the file handle is closed as
# soon as the text has been extracted (the original left it open).
with open('./data/obama-worlds-matter.pdf', 'rb') as pdf_file:
    text = slate.PDF(pdf_file).text()
print(text)
from pdfminer.high_level import extract_text
# extract_text() opens the file itself from the given path. Its second
# positional parameter is the PDF password, not an open() mode, so the
# stray 'rb' argument is dropped.
text = extract_text('./data/obama-worlds-matter.pdf')
print(text)
from PyPDF2 import PdfFileReader
# Open the PDF in binary mode to get a file object for the reader.
# NOTE(review): this handle is not closed anywhere in the visible lines.
pdfObject = open('./data/obama-worlds-matter.pdf', 'rb')
# Wrap the file object in a PyPDF2 reader.
# NOTE(review): PdfFileReader is the legacy PyPDF2 class name (newer
# releases use PdfReader) - confirm against the installed version.
pdfReader = PdfFileReader(pdfObject)
# Accumulator for the concatenated text of every page.
# NOTE(review): the per-page extraction loop is not part of this excerpt.
text=''
from tabula import read_pdf
from tabulate import tabulate
import pandas as pd
import io
# Read only page 6 of the file
food_calories = read_pdf(
    './data/food_calories.pdf',
    pages=6,
    multiple_tables=True,
    stream=True,
)
# Transform the result into a string table format
import tabula
# Write the tables found on page 6 of the PDF straight to a CSV file.
tabula.convert_into(
    './data/food_calories.pdf',
    "./data/food_calories2.csv",
    pages=6,
    output_format="csv",
)