James_McNeill James-McNeill

## a_DISTINCT.sql
-- Return each county present in the table once (duplicates collapsed)
SELECT DISTINCT
    county
FROM hrly_Irish_weather;

## b_UNION.sql
-- Use UNION to combine the results from two or more SELECT statements.
-- UNION keeps only distinct rows across the combined result.
-- String literals use single quotes: in standard SQL double quotes denote identifiers.
SELECT county, station, MAX(temp) AS max_temp, MIN(temp) AS min_temp
FROM hrly_Irish_weather
WHERE county = 'Cork'
GROUP BY county, station
UNION
SELECT county, station, MAX(temp) AS max_temp, MIN(temp) AS min_temp
FROM hrly_Irish_weather
WHERE county = 'Galway'
GROUP BY county, station;

## c_ORDER_BY.sql
-- Order the per-station results by max_temp, ascending (the default direction).
-- The county filter is a single-quoted string literal; double quotes denote identifiers in standard SQL.
SELECT county, station, MAX(temp) AS max_temp, MIN(temp) AS min_temp
FROM hrly_Irish_weather
WHERE county = 'Cork'
GROUP BY county, station
ORDER BY max_temp;
-- Order the per-station results by max_temp, descending.
-- NOTE(review): the original statement stopped after the WHERE clause; the GROUP BY and
-- ORDER BY ... DESC below are restored from the comment and the ascending query - confirm.
SELECT county, station, MAX(temp) AS max_temp, MIN(temp) AS min_temp
FROM hrly_Irish_weather
WHERE county = 'Cork'
GROUP BY county, station
ORDER BY max_temp DESC;

## d_LIMIT_OFFSET.sql
-- Use LIMIT to keep only the top row: the Cork station with the highest max_temp.
-- The county filter is a single-quoted string literal; double quotes denote identifiers in standard SQL.
SELECT county, station, MAX(temp) AS max_temp, MIN(temp) AS min_temp
FROM hrly_Irish_weather
WHERE county = 'Cork'
GROUP BY county, station
ORDER BY max_temp DESC
LIMIT 1;
-- Use LIMIT with OFFSET: skip the first two rows of the ordered result, then return one row.
-- NOTE(review): the original statement stopped after the FROM clause; the filter, grouping,
-- ordering and LIMIT/OFFSET below are reconstructed from the comment and the preceding
-- LIMIT query - confirm the intended offset.
SELECT county, station, MAX(temp) AS max_temp, MIN(temp) AS min_temp
FROM hrly_Irish_weather
WHERE county = 'Cork'
GROUP BY county, station
ORDER BY max_temp DESC
LIMIT 1 OFFSET 2;

## e_WINDOW.sql
-- Number and rank every county/station group by its maximum temperature.
-- Both functions share the named window "win", which orders the groups by MAX(temp).
-- ROW_NUMBER() is unique per row; RANK() gives tied rows the same value and leaves gaps.
-- NOTE(review): the aliases row_number and rank are reserved words in some engines
-- (e.g. MySQL 8.0) - quote or rename them if the target engine rejects them.
SELECT
    county,
    station,
    MAX(temp) AS max_temp,
    MIN(temp) AS min_temp,
    ROW_NUMBER() OVER win AS row_number,
    RANK() OVER win AS rank
FROM hrly_Irish_weather
GROUP BY
    county,
    station
WINDOW win AS (ORDER BY MAX(temp));

## 01_nlp_initial_EDA.py
# pandas provides the `pd` namespace used below; it was referenced without being imported
import pandas as pd

# Import the datasets
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')

# Switch on setting to allow all outputs to be displayed
# (every bare expression in a notebook cell is echoed, not only the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Perform EDA on the train: preview the first rows, then the (rows, columns) shape
train.head()
train.shape

## 02_nlp_spacy_insights.py
# Extract insights from the excerpt variable using spacy
import spacy

# Initialise spacy with english as the language
nlp = spacy.load('en_core_web_sm')

# Perform initial test on the first excerpt
sample1 = train.loc[0, 'excerpt']

# Create the spacy doc item for review
# (the later sections iterate over `doc`, so it has to be built here)
doc = nlp(sample1)

## 03_nlp_token_lemma_stopword.py
# Reviewing the token, lemma, stopword flag and character length for each token (item)
# The header is a plain string (no placeholders to fill) and names all four printed columns.
print("Token \t\tLemma \t\tStopword \t\tLength")
print("-"*40)
# Review the first 20 values to test the output
for token in doc[:20]:
    print(f"{token.text}\t\t{token.lemma_}\t\t{token.is_stop}\t\t{len(token)}")

## 04_review_stopwords.py
# A few different options for stopwords, spacy and nltk. Let's compare
import nltk
from nltk.corpus import stopwords

# Read each stop word collection once and reuse it below
# NOTE(review): stopwords.words() needs the NLTK 'stopwords' corpus to be installed
# (nltk.download('stopwords')) - confirm it is available in this environment.
nltk_stopwords = stopwords.words('english')
spacy_stopwords = nlp.Defaults.stop_words

# Comparison of the stop words available
print(f"NLTK : {len(nltk_stopwords)} \n {nltk_stopwords}")
print(f"Spacy : {len(spacy_stopwords)} \n {spacy_stopwords}")

# Compare the differences
# NOTE(review): the original stopped after building nltk_set; the set differences
# below complete the comparison the comment announces - confirm the intended output.
nltk_set = set(nltk_stopwords)
spacy_set = set(spacy_stopwords)
print(f"Only in NLTK : {sorted(nltk_set - spacy_set)}")
print(f"Only in Spacy : {sorted(spacy_set - nltk_set)}")

## 05_nlp_entities_sentences.py
# Expanding named entities: print each entity's text next to its label
for ent in doc.ents:
    print(ent.text, ent.label_)

# Analyze syntax: gather the pieces first, then report them
noun_phrases = [phrase.text for phrase in doc.noun_chunks]
verb_lemmas = [tok.lemma_ for tok in doc if tok.pos_ == "VERB"]
sentence_count = len(list(doc.sents))

print("Noun phrases:", noun_phrases)
print("Verbs:", verb_lemmas)
print("Number of sentences", sentence_count)
# NOTE(review): presumably the default pipeline does not populate doc.sentiment,
# so this may only print the attribute's default value - confirm.
print("Sentiment", doc.sentiment)
	-- Select distinct values from column county
	SELECT DISTINCT county AS county
	FROM hrly_Irish_weather;

	-- Use UNION to combine the results from two or more SELECT statements (only distinct rows).
	-- String literals use single quotes: in standard SQL double quotes denote identifiers.
	SELECT county, station, MAX(temp) AS max_temp, MIN(temp) AS min_temp
	FROM hrly_Irish_weather
	WHERE county = 'Cork'
	GROUP BY county, station
	UNION
	SELECT county, station, MAX(temp) AS max_temp, MIN(temp) AS min_temp
	FROM hrly_Irish_weather
	WHERE county = 'Galway'
	GROUP BY county, station;

	-- Order the results by column ascending
	SELECT county, station, MAX(temp) AS max_temp, MIN(temp) AS min_temp
	FROM hrly_Irish_weather
	WHERE county = 'Cork'
	GROUP BY county, station
	ORDER BY max_temp;

	-- Order the results by column descending
	-- NOTE(review): the original statement stopped after the WHERE clause and ran into the
	-- next SELECT; the GROUP BY / ORDER BY ... DESC and terminator are restored - confirm.
	SELECT county, station, MAX(temp) AS max_temp, MIN(temp) AS min_temp
	FROM hrly_Irish_weather
	WHERE county = 'Cork'
	GROUP BY county, station
	ORDER BY max_temp DESC;

	-- Using LIMIT to select the top result
	SELECT county, station, MAX(temp) AS max_temp, MIN(temp) AS min_temp
	FROM hrly_Irish_weather
	WHERE county = 'Cork'
	GROUP BY county, station
	ORDER BY max_temp DESC
	LIMIT 1;

	-- Using LIMIT with an OFFSET: skip the first two rows, then return one row
	-- NOTE(review): the original statement stopped after the FROM clause; the remaining
	-- clauses are reconstructed from the comment and the preceding query - confirm.
	SELECT county, station, MAX(temp) AS max_temp, MIN(temp) AS min_temp
	FROM hrly_Irish_weather
	WHERE county = 'Cork'
	GROUP BY county, station
	ORDER BY max_temp DESC
	LIMIT 1 OFFSET 2;

	-- WINDOW functions for row_number and rank, sharing the named window "win"
	SELECT county, station, MAX(temp) AS max_temp, MIN(temp) AS min_temp,
		ROW_NUMBER() OVER win AS row_number,
		RANK() OVER win AS rank
	FROM hrly_Irish_weather
	GROUP BY county, station
	WINDOW win AS (ORDER BY MAX(temp));
	# pandas provides the `pd` namespace used below; it was referenced without being imported
	import pandas as pd

	# Import the datasets
	train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')

	# Switch on setting to allow all outputs to be displayed
	from IPython.core.interactiveshell import InteractiveShell
	InteractiveShell.ast_node_interactivity = "all"

	# Perform EDA on the train
	train.head()
	train.shape

	# Extract insights from the excerpt variable using spacy
	import spacy

	# Initialise spacy with english as the language
	nlp = spacy.load('en_core_web_sm')

	# Perform initial test on the first excerpt
	sample1 = train.loc[0, 'excerpt']

	# Create the spacy doc item for review (the loops below iterate over `doc`)
	doc = nlp(sample1)

	# Reviewing the token, lemma, stopword flag and character length for each token (item)
	# The header is a plain string and names all four printed columns.
	print("Token \t\tLemma \t\tStopword \t\tLength")
	print("-"*40)
	# Review the first 20 values to test the output
	for token in doc[:20]:
		print(f"{str(token)}\t\t{token.lemma_}\t\t{token.is_stop}\t\t{len(token)}")

	# A few different options for stopwords, spacy and nltk. Let's compare
	import nltk
	from nltk.corpus import stopwords

	# Comparison of the stop words available
	print(f"NLTK : {len(stopwords.words('english'))} \n {stopwords.words('english')}")
	print(f"Spacy : {len(nlp.Defaults.stop_words)} \n {nlp.Defaults.stop_words}")

	# Compare the differences
	nltk_set = set(stopwords.words('english'))

	# Expanding named entities: the print belongs to the loop body, so it is indented under it
	for entity in doc.ents:
		print(entity.text, entity.label_)

	# Analyze syntax
	print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
	print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])
	print("Number of sentences", len([*doc.sents]))
	print("Sentiment", doc.sentiment)