import re

def match_regex_patt(df, target_col, regex_patt, no_match_value='None'):
    '''Returns regex_patt matches in target_col as a list. Case is ignored.'''
    matchList = []
    cnt = 0
    for idx in df.index:
        m = re.search(regex_patt, df.loc[idx, target_col], flags=re.IGNORECASE)  #re.search(pattern, string) gets first match
        if m is not None:
            matchList.append(m.group(1).lower())
        else:
            matchList.append(no_match_value)  #keep list aligned with df rows
            cnt += 1
    print(cnt, 'rows had no match.')
    return matchList
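# Minimal usage sketch -- the DataFrame contents, column name, and pattern below
# are illustrative assumptions, not from the original gist.
import pandas as pd

demo = pd.DataFrame({'title': ['2012 Ford F-150', '2015 TOYOTA Camry', 'rusty sedan']})
demo['make'] = match_regex_patt(demo, 'title', r'\b(ford|toyota|chevy)\b')
print(demo['make'].tolist())  # ['ford', 'toyota', 'None']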
########################################################################
# #
# Compare appending performance - DataFrames versus lists #
# #
########################################################################
import time
import pandas as pd
from random import randint
import sys
import sqlite3
def write_table(df, table_name_str):
    '''Writes dataframe to a SQLite table named table_name_str.'''
    with sqlite3.connect('auto.sqlite', isolation_level=None) as conn:  # autocommit mode
        return df.to_sql(table_name_str, con=conn, if_exists='replace', index=False)
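# The benchmark body is missing from this fragment; below is a minimal sketch of
# the comparison the banner describes (row count and column names are assumptions).
n_rows = 10_000

start = time.time()
df = pd.DataFrame(columns=['a', 'b'])
for i in range(n_rows):
    df.loc[len(df)] = [randint(0, 9), randint(0, 9)]  # append one row at a time
print('DataFrame append:', round(time.time() - start, 2), 'sec')

start = time.time()
rows = []
for i in range(n_rows):
    rows.append([randint(0, 9), randint(0, 9)])  # append to a list, build df once
df = pd.DataFrame(rows, columns=['a', 'b'])
print('list append:    ', round(time.time() - start, 2), 'sec')

write_table(df, 'rand_ints')  # persist the result with the helper above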
#Save tables to PostgreSQL 11.3 database.
import psycopg2 #version 2.8.4
from sqlalchemy import create_engine #version 1.3.11
from sqlalchemy.dialects.postgresql import JSON, JSONB
#The text at the end, postgres, is the database name.
engine = create_engine("postgresql://postgres@localhost:5432/postgres")
#Write vehicle table with json columns
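# The write itself is missing from this fragment; a minimal sketch -- the
# `vehicle` DataFrame and its columns are illustrative assumptions, not from the gist.
import pandas as pd

vehicle = pd.DataFrame({
    'model': ['F-150'],
    'features': [{'cab': 'crew', 'bed_length_ft': 5.5}],  # dicts -> json column
})
vehicle.to_sql('vehicle', con=engine, if_exists='replace', index=False,
               dtype={'features': JSONB})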
# -*- coding: utf-8 -*-
import scrapy
#scrape with this terminal command: scrapy crawl clspider -o mycity.json
#scrapy version = 1.6.0. Shift + Alt + F to format JSON in VS Code.
class ClspiderSpider(scrapy.Spider):
    name = 'clspider'
    allowed_domains = ['craigslist.org']
    start_urls = ['https://elpaso.craigslist.org/search/cta?auto_make_model=ford']  #cta is cars + trucks by ALL
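    #The parse callback is missing from this fragment; a minimal sketch -- the CSS
    #selectors are assumptions about the Craigslist result markup, not from the gist.
    def parse(self, response):
        for listing in response.css('li.result-row'):
            yield {
                'title': listing.css('a.result-title::text').extract_first(),
                'price': listing.css('span.result-price::text').extract_first(),
                'url': listing.css('a.result-title::attr(href)').extract_first(),
            }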
import numpy as np
from keras.applications.vgg16 import preprocess_input  #assumed VGG16 preprocessing, per the VGG16 import below

#Class names and shapes
print('1 train class names:', train_ds.class_names)
print('2 val class names:', val_ds.class_names)
for image_batch, labels_batch in train_ds:
    print('3 images, xpixels, ypixels, color_channels:', image_batch.shape)
    print('4 labels:', labels_batch.shape)
    break
#Preprocess dataset per https://www.tensorflow.org/tutorials/load_data/images
train_ds = train_ds.map(lambda x, y: (preprocess_input(x), y))
val_ds = val_ds.map(lambda x, y: (preprocess_input(x), y))
# Show min/max of first image. Notice the pixel values after preprocess.
image_batch, labels_batch = next(iter(train_ds))
first_image = image_batch[0]
print('image min and max values:', np.min(first_image), np.max(first_image), '\n\n')
# Load and freeze VGG16 model.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

def annotate_w_xy_corr(x, y, **kwargs):
    '''Annotates the current axes with the Pearson correlation of x and y.'''
    coef = np.corrcoef(x, y)[0][1]
    label = r'corr = ' + str(round(coef, 3))
    ax = plt.gca()
    ax.annotate(label, xy=(0.3, .07), xycoords=ax.transAxes, c='darkred')  #size = 18
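# Hypothetical usage sketch -- the demo data are an assumption, not from the gist:
# map the correlation annotation onto the lower panels of a seaborn PairGrid.
demo = pd.DataFrame(np.random.rand(50, 3), columns=['a', 'b', 'c'])
g = sns.PairGrid(demo)
g.map_lower(sns.scatterplot)
g.map_lower(annotate_w_xy_corr)
plt.show()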
from keras.applications.vgg16 import VGG16
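# The load/freeze step named in the 'Load and freeze VGG16 model' comment above is
# missing from this fragment; a minimal sketch (input shape matches the 224x224 dataset).
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False  # freeze the convolutional base for transfer learning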
import tensorflow as tf

#Create dataset. The file_paths method shows train_ds and val_ds are mutually exclusive.
train_ds = tf.keras.utils.image_dataset_from_directory(
    './images/train/',
    labels='inferred',
    shuffle=True,
    seed=8,
    image_size=(224, 224),
    batch_size=32)
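#val_ds creation is missing from this fragment; a minimal sketch, assuming a
#parallel './images/val/' directory (the path is an assumption, not from the gist).
val_ds = tf.keras.utils.image_dataset_from_directory(
    './images/val/',
    labels='inferred',
    shuffle=True,
    seed=8,
    image_size=(224, 224),
    batch_size=32)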