Skip to content

Instantly share code, notes, and snippets.

View James-McNeill's full-sized avatar
🏠
Working from home

James_McNeill James-McNeill

🏠
Working from home
  • Dublin, Ireland
View GitHub Profile
-- List every county that appears in the table, one row per distinct value
SELECT DISTINCT county
FROM hrly_Irish_weather;
-- Use UNION to combine the results of two or more SELECT statements.
-- UNION keeps only distinct rows across the combined result.
-- String literals use single quotes: double quotes denote identifiers in
-- standard SQL and are only accepted as strings by some engines.
SELECT county, station, max(temp) AS max_temp, min(temp) AS min_temp
FROM hrly_Irish_weather
WHERE county = 'Cork'
GROUP BY county, station
UNION
SELECT county, station, max(temp) AS max_temp, min(temp) AS min_temp
FROM hrly_Irish_weather
WHERE county = 'Galway'
GROUP BY county, station;
-- Order the results by a column, ascending (the default sort direction).
-- The string literal is single-quoted so it is never parsed as an identifier.
SELECT county, station, max(temp) AS max_temp, min(temp) AS min_temp
FROM hrly_Irish_weather
WHERE county = 'Cork'
GROUP BY county, station
ORDER BY max_temp;
-- Order the results by a column, descending.
-- The per-station aggregates need the GROUP BY, and the statement must be
-- terminated so it does not run into the query that follows.
SELECT county, station, max(temp) AS max_temp, min(temp) AS min_temp
FROM hrly_Irish_weather
WHERE county = 'Cork'
GROUP BY county, station
ORDER BY max_temp DESC;
-- Use LIMIT to keep only the top result: the Cork station with the highest
-- maximum temperature. The string literal is single-quoted so it is never
-- parsed as an identifier.
SELECT county, station, max(temp) AS max_temp, min(temp) AS min_temp
FROM hrly_Irish_weather
WHERE county = 'Cork'
GROUP BY county, station
ORDER BY max_temp DESC
LIMIT 1;
-- Use LIMIT with an OFFSET to skip the first two rows and return the one
-- that follows them.
-- NOTE(review): the county filter, grouping and ordering follow the pattern of
-- the other per-station queries on this table -- confirm the intended filter.
SELECT county, station, max(temp) AS max_temp, min(temp) AS min_temp
FROM hrly_Irish_weather
WHERE county = 'Cork'
GROUP BY county, station
ORDER BY max_temp DESC
LIMIT 1 OFFSET 2;
-- Window functions: number and rank each (county, station) group,
-- ordered by the group's maximum temperature via the named window "win".
SELECT
    county,
    station,
    MAX(temp) AS max_temp,
    MIN(temp) AS min_temp,
    ROW_NUMBER() OVER win AS row_number,
    RANK() OVER win AS rank
FROM hrly_Irish_weather
GROUP BY county, station
WINDOW win AS (ORDER BY MAX(temp));
# Import the training dataset from the CSV file into a DataFrame.
# NOTE(review): `pd` is presumably pandas imported in an earlier cell -- confirm.
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
# Switch on the IPython setting that displays the value of every top-level
# expression in a cell, not only the last one. This must run before the bare
# expressions below for both of them to be shown.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Perform EDA on the train data: preview the first rows and check the dimensions
train.head()
train.shape
# Extract insights from the excerpt variable using spacy
import spacy

# Initialise spacy with the 'en_core_web_sm' English pipeline
nlp = spacy.load('en_core_web_sm')

# Perform an initial test on the first excerpt of the training data
sample1 = train.loc[0, 'excerpt']

# Create the spacy doc item for review; the loop below iterates over it
doc = nlp(sample1)

# Reviewing the token, lemma and stopword flag for each token (item).
# Each row also carries the token length as a fourth, unlabelled column.
print("Token \t\tLemma \t\tStopword")
print("-" * 40)

# Review the first 20 values to test the output
for token in doc[:20]:
    print(f"{str(token)}\t\t{token.lemma_}\t\t{token.is_stop}\t\t{len(token)}")
# spacy and nltk each ship their own English stop-word list. Let's compare them.
import nltk
from nltk.corpus import stopwords

# Bind each collection once so it can be reused for the size and the listing
nltk_stop_words = stopwords.words('english')
spacy_stop_words = nlp.Defaults.stop_words

# Show how many stop words each library provides, followed by the words
print(f"NLTK : {len(nltk_stop_words)} \n {nltk_stop_words}")
print(f"Spacy : {len(spacy_stop_words)} \n {spacy_stop_words}")

# Set form of the NLTK list, ready for comparing the differences
nltk_set = set(nltk_stop_words)
# Expanding named entities: print each entity's text alongside its label
for entity in doc.ents:
    print(entity.text, entity.label_)

# Analyze syntax: noun chunks, verb lemmas, sentence count and doc sentiment
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])
# doc.sents is unpacked into a list so its length can be taken
print("Number of sentences", len([*doc.sents]))
print("Sentiment", doc.sentiment)