Stefan Thoss stefanthoss

## cik_dict.py
import re
import requests

# Ticker symbols to look up on EDGAR.
DEFAULT_TICKERS = ["BBRY", "VOD", "T", "S"]
# EDGAR company-browse endpoint; "{}" is replaced with the ticker symbol.
URL = "http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany"
# Captures the 10-digit number that follows "CIK=" in the response body.
CIK_RE = re.compile(r".*CIK=(\d{10}).*")
# Seconds to wait for a response; without a timeout requests.get can block forever.
REQUEST_TIMEOUT = 10

# Maps each ticker symbol to the first CIK found in its EDGAR response.
# Tickers whose response contains no match are left out of the dict.
cik_dict = {}
for ticker in DEFAULT_TICKERS:
    response = requests.get(URL.format(ticker), timeout=REQUEST_TIMEOUT)
    # The digits being extracted are plain ASCII, so replacing undecodable
    # bytes cannot affect a match, but it keeps a stray non-ASCII character
    # elsewhere in the page from raising UnicodeDecodeError.
    page = response.content.decode("ascii", errors="replace")
    results = CIK_RE.findall(page)
    if results:
        cik_dict[ticker] = results[0]

## mysql-pandas-import.py
import pandas as pd
import pymysql
from sqlalchemy import create_engine

# Connection URL template: USER, PASSWORD, HOST, PORT and DBNAME are
# placeholders to be replaced with real connection details.
MYSQL_URL = "mysql+pymysql://USER:PASSWORD@HOST:PORT/DBNAME"
# Query whose full result set is loaded into the DataFrame.
QUERY = "SELECT * FROM table"

engine = create_engine(MYSQL_URL)

# Run the query through the engine and collect the rows into a DataFrame.
df = pd.read_sql_query(QUERY, engine)
# Preview the first rows; the value is only displayed in an interactive
# session (REPL / notebook), a plain script run discards it.
df.head()

## git_backup_script.sh
#!/bin/bash
# Commit every change in the repository and push it as a timestamped backup.

# Abort if the repository directory is unavailable; otherwise the git
# commands below would run in whatever directory the script was started from.
cd /path/to/git/repo/ || exit 1

# Stage all additions, modifications and deletions.
git add -A
# $(...) is the POSIX command substitution; unlike backticks it nests cleanly.
git commit -m "Backup on $(date)"
git push origin

## postgres-csv-import.py
import pandas as pd
import numpy as np
import psycopg2
from sqlalchemy import create_engine

# Connection URL template: USER, PASSWORD, HOST, PORT and DBNAME are
# placeholders to be replaced with real connection details.
engine = create_engine('postgresql+psycopg2://USER:PASSWORD@HOST:PORT/DBNAME')

# Load the CSV and turn literal 'null' cells into missing values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0 and
# raises AttributeError there.
df = pd.read_csv('local-file.csv', sep=',').replace(to_replace='null', value=np.nan)
# if_exists='replace' drops an existing dbschema.dbtable before writing the
# DataFrame, so previous contents of that table are lost.
df.to_sql('dbtable', engine, schema='dbschema', if_exists='replace')

## export-postgresql-table.sh
# Write the result of a query to file.csv as comma-separated rows (no header row).
psql \
  --host=hostname \
  --username=username \
  --password \
  --dbname=database \
  --tuples-only \
  --no-align \
  --field-separator="," \
  --command="SELECT * FROM table" \
  > file.csv

# Explanation of the options used (short form in parentheses):
# --host (-h)             Host name of the machine on which the server is running.
# --username (-U)         Connect to the database as this user.
# --password (-W)         Force psql to prompt for a password before connecting.
# --dbname (-d)           Name of the database to connect to.
# --tuples-only (-t)      Turn off printing of column names and result row count footers, etc.
# --no-align (-A)         Switch to unaligned output mode.
# --field-separator (-F)  Field separator to use for unaligned output.
# --command (-c)          Execute the given command string and exit.

## export-pyspark-schema-to-json.py
import json
from pyspark.sql.types import *

# Define the schema: a string column "name" and an integer column "age".
# The third StructField argument (True) marks the column as nullable.
schema = StructType(
    [StructField("name", StringType(), True), StructField("age", IntegerType(), True)]
)

# Write the schema to schema.json. jsonValue() returns the schema as plain
# dicts and lists, which json.dump can serialize directly.
with open("schema.json", "w") as f:
    json.dump(schema.jsonValue(), f)

## pbzip2.Dockerfile
# Base image: Alpine Linux 3.10.
FROM alpine:3.10

# Install bzip2-dev, g++ and make; --no-cache keeps the apk package index
# out of the image layer.
# NOTE(review): presumably these are the build dependencies for compiling
# pbzip2 from the source tarball fetched below - confirm.
RUN apk add --no-cache \
  bzip2-dev \
  g++ \
  make

RUN cd /tmp/ && \
  wget -q https://launchpad.net/pbzip2/1.1/1.1.13/+download/pbzip2-1.1.13.tar.gz && \
  tar -xzf pbzip2-1.1.13.tar.gz && \

## advanced-dataframe-union.py
def advanced_dataframe_union(df1, df2):
    df1_fields = set((f.name, f.dataType) for f in df1.schema)
    df2_fields = set((f.name, f.dataType) for f in df2.schema)

    df2 = df2.select(
        df2.columns
        + [
            F.lit(None).cast(datatype).alias(name)
            for name, datatype in df1_fields.difference(df2_fields)
        ]

## find-postgres-tables-by-column.sql
-- List schema, table and column name for every column whose name matches
-- the regular expression 'regex' (~ is PostgreSQL's case-sensitive regex
-- match operator). Replace 'regex' with the pattern to search for.
SELECT
  tbl.table_schema,
  tbl.table_name,
  col.column_name
FROM
  information_schema.columns AS col
  INNER JOIN information_schema.tables AS tbl
    ON tbl.table_schema = col.table_schema
    AND tbl.table_name = col.table_name
WHERE
  col.column_name ~ 'regex'

## search-pandas-columns-for-string.py
import pandas as pd

# Input:
#            a     b        c    d    e
# 0  Text       Text  NaN      0.0  5
# 1  Text       NaN   1.1.1.1  0.0  55
# 2  Text.Text  Text  Text     0.4  555

data = [
    {"a": "Text", "b": "Text", "d": 0, "e": 5},
	import re
	import requests

	DEFAULT_TICKERS = ["BBRY", "VOD", "T", "S"]
	URL = "http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany"
	CIK_RE = re.compile(r".CIK=(\d{10}).")

	cik_dict = {}
	for ticker in DEFAULT_TICKERS:
	results = CIK_RE.findall(requests.get(URL.format(ticker)).content.decode("ascii"))
	import pandas as pd
	import pymysql
	from sqlalchemy import create_engine

	engine = create_engine("mysql+pymysql://USER:PASSWORD@HOST:PORT/DBNAME")

	df = pd.read_sql_query("SELECT * FROM table", engine)
	df.head()
	#!/bin/bash

	cd /path/to/git/repo/
	git add -A
	git commit -m "Backup on `date`"
	git push origin
	import pandas as pd
	import numpy as np
	import psycopg2
	from sqlalchemy import create_engine

	engine = create_engine('postgresql+psycopg2://USER:PASSWORD@HOST:PORT/DBNAME')

	df = pd.read_csv('local-file.csv', sep=',').replace(to_replace='null', value=np.NaN)
	df.to_sql('dbtable', engine, schema='dbschema', if_exists='replace')
	psql -h hostname -U username -W -d database -t -A -F "," -c "SELECT * FROM table" > file.csv

	# Explanation of the used options:
	# -h Specifies the host name of the machine on which the server is running.
	# -U Connect to the database as a specific user.
	# -W Force psql to prompt for a password before connecting to a database.
	# -d Specifies the name of the database to connect to.
	# -t Turn off printing of column names and result row count footers, etc.
	# -A Switches to unaligned output mode.
	# -F Use separator as the field separator for unaligned output.
	import json
	from pyspark.sql.types import *

	# Define the schema
	schema = StructType(
	[StructField("name", StringType(), True), StructField("age", IntegerType(), True)]
	)

	# Write the schema
	with open("schema.json", "w") as f:
	FROM alpine:3.10

	RUN apk add --no-cache \
	bzip2-dev \
	g++ \
	make

	RUN cd /tmp/ && \
	wget -q https://launchpad.net/pbzip2/1.1/1.1.13/+download/pbzip2-1.1.13.tar.gz && \
	tar -xzf pbzip2-1.1.13.tar.gz && \
	def advanced_dataframe_union(df1, df2):
	df1_fields = set((f.name, f.dataType) for f in df1.schema)
	df2_fields = set((f.name, f.dataType) for f in df2.schema)

	df2 = df2.select(
	df2.columns
	+ [
	F.lit(None).cast(datatype).alias(name)
	for name, datatype in df1_fields.difference(df2_fields)
	]
	SELECT
	t.table_schema,
	t.table_name,
	c.column_name
	FROM
	information_schema.tables t
	INNER JOIN information_schema.columns c ON c.table_name = t.table_name
	AND c.table_schema = t.table_schema
	WHERE
	c.column_name ~ 'regex'
	import pandas as pd

	# Input:
	# a b c d e
	# 0 Text Text NaN 0.0 5
	# 1 Text NaN 1.1.1.1 0.0 55
	# 2 Text.Text Text Text 0.4 555

	data = [
	{"a": "Text", "b": "Text", "d": 0, "e": 5},