Skip to content

Instantly share code, notes, and snippets.

Paul Meinshausen PMeinshausen

View GitHub Profile
View chiangMaiCrawler.py
import time

from selenium import webdriver

# Launch a Firefox instance that the rest of the script drives.
browser = webdriver.Firefox()
# The list of book titles is in a pandas dataframe
# named 'data', in the column 'title'
# NOTE(review): 'data' is not defined in this fragment; it must exist before
# this loop runs.
for row in range(len(data)):
    # Open the Amazon home page once per row of 'data', then pause for
    # one second before continuing.
    browser.get("http://www.amazon.com")
    time.sleep(1)
View TF_IDF.sql
--Syntax
SELECT * FROM TF_IDF(
ON TF
(
ON {table|view|(query)} PARTITION BY docid
[FORMULA('bool'|'log'|'augment'|'normal')]
) AS TF PARTITION BY term
[ON (SELECT term, COUNT(distinct docid) FROM input_table
GROUP BY term) AS docperterm PARTITION BY term]
View Sentenizer.sql
--Syntax (version 1.0)
select * from Sentenizer(
on input_table
TEXTCOLUMN('text_column_name')
[ACCUMULATE('accumulate_column_names')]);
View PoSTagger.sql
--Syntax (version 1.0)
SELECT * from PosTagger(
ON input_table
TEXTCOLUMN('text_column_name')
[ACCUMULATE('accumulate_column_names')]
);
View nGram.sql
--Syntax (version 1.3)
SELECT *
FROM nGram
(
ON {table_name | view_name | (query)}
TEXT_COLUMN('column_name')
[DELIMITER('delimiter_regular_expression')]
GRAMS(gram_number)
[OVERLAPPING({'true'|'false'})]
[CASE_INSENSITIVE({'true'|'false'})]
View NB_TextClassifier.sql
--Syntax (version 1.0)
CREATE TABLE model_table_name ( PARTITION KEY(token) ) AS
SELECT token, SUM( category_1 ) AS category_1, ... ,
SUM( category_n ) AS category_n FROM
NaiveBayesText(
ON input_table
TEXT_COLUMN( text_column )
CATEGORY_COLUMN( category_column )
CATEGORIES( category_1, ... , category_n )
View Levenshtein.sql
--Syntax (version 1.1)
--Use a SELECT statement to call the Levenshtein distance function:
SELECT *
FROM ldist
(
ON table_name
SOURCE (column1 [, column2,...])
TARGET(column1)
[THRESHOLD(value)]
View WMAVG.sql
--Syntax (version 1.1)
SELECT * FROM WMAVG(
ON {table_name|view_name|(query)}
PARTITION BY partition_column
ORDER BY order_by_column
COLUMNS('column_names')
RETURN_ALL('true'|'false')
WINDOW_SIZE('window_size')
);
View VWAP.sql
--Syntax (version 1.1)
SELECT * FROM VWAP(
ON {table_name | view_name | (query)}
PARTITION BY expression [, ...]
ORDER BY date_column
[PRICE('price_column')]
[VOLUME ('volume_column')]
[TIMEINTERVAL('number_of_seconds')]
[DT('date_column')]
);
View Sample.sql
--Syntax (version 1.0)
--Unconditional sampling, single sample rate
select * from sample(
ON ...
SAMPLEFRACTION('fraction')
[Seed('seed')]
--Unconditional sampling, total approximate sample size
select * from sample (
You can’t perform that action at this time.