This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- List every county present in the hourly weather table, one row per county.
SELECT DISTINCT
    county AS county
FROM hrly_Irish_weather;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Combine the per-station temperature extremes of two counties with UNION.
-- UNION stacks the result sets of both SELECTs and removes duplicate rows.
-- Each branch filters on a different county and returns that county in its
-- output, so the two branches cannot produce the same row; UNION ALL would
-- return the same rows here without the de-duplication step.
-- String values use single quotes: in standard SQL double quotes denote
-- identifiers, not string literals.
SELECT
    county,
    station,
    MAX(temp) AS max_temp,
    MIN(temp) AS min_temp
FROM hrly_Irish_weather
WHERE county = 'Cork'
GROUP BY county, station
UNION
SELECT
    county,
    station,
    MAX(temp) AS max_temp,
    MIN(temp) AS min_temp
FROM hrly_Irish_weather
WHERE county = 'Galway'
GROUP BY county, station;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Order the Cork stations by their maximum temperature, ascending
-- (ASC is the default direction when none is given).
-- The county value is a single-quoted string literal; double quotes denote
-- identifiers in standard SQL.
-- station is added as a secondary sort key so that stations sharing the same
-- max_temp always come back in the same order.
SELECT
    county,
    station,
    MAX(temp) AS max_temp,
    MIN(temp) AS min_temp
FROM hrly_Irish_weather
WHERE county = 'Cork'
GROUP BY county, station
ORDER BY max_temp, station;
-- Order the results by column descending | |
SELECT county, station, max(temp) as max_temp, min(temp) as min_temp | |
FROM hrly_Irish_weather | |
WHERE county = "Cork" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Use LIMIT to keep only the top result: the Cork station with the highest
-- maximum temperature.
-- The county value is a single-quoted string literal; double quotes denote
-- identifiers in standard SQL.
-- station breaks ties on max_temp, so the single row returned is the same on
-- every run even when several stations share the highest value.
SELECT
    county,
    station,
    MAX(temp) AS max_temp,
    MIN(temp) AS min_temp
FROM hrly_Irish_weather
WHERE county = 'Cork'
GROUP BY county, station
ORDER BY max_temp DESC, station
LIMIT 1;
-- Using limit to select the top result, adding an offset to start after the second line | |
SELECT county, station, max(temp) as max_temp, min(temp) as min_temp | |
FROM hrly_Irish_weather |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Window functions: number and rank every (county, station) group by its
-- maximum temperature, lowest first.
--   row_number() gives each row a distinct sequence number.
--   rank() gives tied rows the same rank and leaves gaps after the tie.
-- Both functions share the named window "win" declared in the WINDOW clause.
-- NOTE(review): the aliases row_number and rank are reserved words in some
-- engines (e.g. MySQL 8); quote or rename them if this query is ported.
SELECT
    county,
    station,
    MAX(temp) AS max_temp,
    MIN(temp) AS min_temp,
    ROW_NUMBER() OVER win AS row_number,
    RANK() OVER win AS rank
FROM hrly_Irish_weather
GROUP BY county, station
WINDOW win AS (ORDER BY MAX(temp));
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ask IPython to display the value of every top-level expression,
# not only the last one in a cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load the training data into a DataFrame.
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')

# First look at the data: leading rows, then (rows, columns).
train.head()
train.shape
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# spaCy is used to pull linguistic features out of the excerpt column.
import spacy

# Load the small English pipeline.
nlp = spacy.load('en_core_web_sm')

# Take the excerpt of the row labelled 0 as a first test sample.
sample1 = train.loc[0, 'excerpt']
# Create the spacy doc item for review |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print a small table with the text, lemma, stop-word flag and character
# length of each token.
# The header is a plain string: the original applied .format() to an f-string
# with no placeholders (a no-op) and named only three of the four columns
# that each row prints.
print("Token \t\tLemma \t\tStopword \t\tLength")
print("-" * 40)
# Only the first 20 tokens are shown, as a quick check of the output.
for token in doc[:20]:
    print(f"{token.text}\t\t{token.lemma_}\t\t{token.is_stop}\t\t{len(token)}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Stop-word lists are available from both NLTK and spaCy; compare them.
import nltk
from nltk.corpus import stopwords

# Fetch each list once and reuse it below.
nltk_stopwords = stopwords.words('english')
spacy_stopwords = nlp.Defaults.stop_words

# Show the size and contents of each list.
print(f"NLTK : {len(nltk_stopwords)} \n {nltk_stopwords}")
print(f"Spacy : {len(spacy_stopwords)} \n {spacy_stopwords}")

# Set form of the NLTK list, ready for comparing the differences.
nltk_set = set(nltk_stopwords)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Named entities: print each entity's text next to its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

# Syntax overview: collect the pieces first, then report them.
noun_phrases = [chunk.text for chunk in doc.noun_chunks]
verb_lemmas = [token.lemma_ for token in doc if token.pos_ == "VERB"]
sentence_count = len(list(doc.sents))

print("Noun phrases:", noun_phrases)
print("Verbs:", verb_lemmas)
print("Number of sentences", sentence_count)
print("Sentiment", doc.sentiment)
OlderNewer