-
-
Save ceshine/0a33de090ea66c6074d8719ead07ced1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### A Pluto.jl notebook ### | |
# v0.14.2 | |
using Markdown | |
using InteractiveUtils | |
# ╔═╡ b5ad061a-a297-11eb-0308-91355c5c6d15 | |
using DataFrames, CSV, WordTokenizers, Plots, Random, Statistics | |
# ╔═╡ 219ff84e-1bb6-4650-b9bc-c1e22a18aff1 | |
md"## Read and Transform the Data | |
Data source: [Shopee - Price Match Guarantee](https://www.kaggle.com/c/shopee-product-matching/data) | |
" | |
# ╔═╡ 6c5e5252-0d4c-4422-ada4-377dbf87e9db | |
# Read the training CSV, keeping only the product title and its label group, and
# materialize the result as a DataFrame.
df = CSV.File("../data/train.csv"; select=[:title, :label_group]) |> DataFrame
# ╔═╡ e342798e-e079-4cea-a0f3-482ac0f37c2d | |
md"Make all characters lower cased:" | |
# ╔═╡ ee90d5e9-e978-4ad2-a82b-ae4fc0a1abe7 | |
# Replace the title column with its lower-cased version. `lowercase` is passed
# directly instead of being wrapped in a redundant anonymous function.
df.title = map(lowercase, df.title);
# ╔═╡ 2f6784e0-a1e8-4f77-b8b9-34f23797cc17 | |
md"Group by label_group:" | |
# ╔═╡ 3d04e1cf-dbbf-479d-81dc-fd5d5e6a9b8c | |
# Partition the rows by label group; each group holds the titles that share a label.
groups = groupby(df, :label_group)
# ╔═╡ 0e66e630-4ed6-4b7d-938b-ed1a1c14daad | |
md"## Tokenizer demo | |
List of tokenize functions demonstrated here (from the `WordTokenizers` package): | |
1. `punctuation_space_tokenize` | |
2. `penn_tokenize` | |
3. `nltk_word_tokenize` | |
4. `poormans_tokenize` | |
" | |
# ╔═╡ 1598b104-0e27-45cc-8306-e8b68159362b | |
# First sample title, run through each of the four tokenizers listed above.
punctuation_space_tokenize("sprei lady rose 180x200 king terlaris keroppi")

# ╔═╡ e8032f5f-10b8-42f9-b072-07440d06f6f3
penn_tokenize("sprei lady rose 180x200 king terlaris keroppi")

# ╔═╡ 5bbc4ba9-9ed0-4c22-b934-7ac470ba6e12
# `poormans_tokenize` completes the set of four tokenizers for this title
# (`punctuation_space_tokenize` is already shown in the first demo cell).
poormans_tokenize("sprei lady rose 180x200 king terlaris keroppi")

# ╔═╡ 3e14d636-ed5c-4a4a-9ecf-0a9bd463966d
nltk_word_tokenize("sprei lady rose 180x200 king terlaris keroppi")

# ╔═╡ c83c778f-baa8-4d2b-a8a9-70411eb64867
# Second sample title (contains digits, a decimal comma and a slash), run through
# the same four tokenizers.
punctuation_space_tokenize("double tape 3m vhb 12 mm x 4,5 m original / double foam tape")

# ╔═╡ abd84ebb-a419-4bfa-856f-3a328251c506
poormans_tokenize("double tape 3m vhb 12 mm x 4,5 m original / double foam tape")

# ╔═╡ 1b64053e-375b-44a7-a037-d783f1e0c3a5
penn_tokenize("double tape 3m vhb 12 mm x 4,5 m original / double foam tape")

# ╔═╡ 6277476d-64a3-484f-bdf4-df1187d97412
nltk_word_tokenize("double tape 3m vhb 12 mm x 4,5 m original / double foam tape")
# ╔═╡ ecb1c961-6669-413b-b514-49aaa9743422 | |
md"## Tokenize and Count" | |
# ╔═╡ d1e10f59-71e3-4326-87c3-50dc1dc9a729 | |
"""
    tokenize_and_count(groups::GroupedDataFrame, tokenize_func::Function)

Build a table with one row for every unordered pair of titles that belong to the
same group, together with token-set statistics for that pair.

# Arguments
- `groups`: grouped data frame whose groups have `title` and `label_group` columns.
- `tokenize_func`: function mapping a title to a collection of tokens.

# Returns
A `DataFrame` with the columns `text_1`, `text_2`, `label_group`, `n_1`, `n_2`
(number of distinct tokens in each title), `intersect` and `union` (sizes of the
token-set intersection and union), `jaccard` (`intersect / (n_1 + n_2 - intersect)`)
and `overlap` (`intersect / min(n_1, n_2)`).
"""
function tokenize_and_count(groups::GroupedDataFrame, tokenize_func::Function)
    text_1, text_2 = String[], String[]
    label_group = Int64[]
    n_1, n_2 = Int64[], Int64[]
    shared, combined = Int64[], Int64[]
    for members in groups
        titles = members[!, :title]
        labels = members[!, :label_group]
        # Tokenize each title once; repeated tokens within a title collapse in the set.
        token_sets = [Set(tokenize_func(title)) for title in titles]
        n_members = nrow(members)
        # Visit every pair (i, j) with i < j exactly once.
        for i in 1:(n_members - 1), j in (i + 1):n_members
            set_i, set_j = token_sets[i], token_sets[j]
            push!(text_1, titles[i])
            push!(text_2, titles[j])
            push!(label_group, labels[i])
            push!(n_1, length(set_i))
            push!(n_2, length(set_j))
            push!(shared, length(intersect(set_i, set_j)))
            push!(combined, length(union(set_i, set_j)))
        end
    end
    df_combo = DataFrame(
        text_1=text_1, text_2=text_2, label_group=label_group,
        n_1=n_1, n_2=n_2, intersect=shared, union=combined,
    )
    # Jaccard index: |A ∩ B| / |A ∪ B|, with |A ∪ B| = |A| + |B| - |A ∩ B|.
    df_combo.jaccard = shared ./ (n_1 .+ n_2 .- shared)
    # Overlap coefficient: |A ∩ B| / min(|A|, |B|).
    df_combo.overlap = shared ./ min.(n_1, n_2)
    return df_combo
end
# ╔═╡ d8cb00a0-d3a6-42b2-b2d7-9fd9631b88e2 | |
md"Compare results from the two tokenizers (nltk and penn):" | |
# ╔═╡ 8db610b0-f606-42d1-8e06-91f4b9e45a01 | |
begin
    # Pairwise token statistics using the NLTK-style tokenizer; show the first 3 rows.
    df_nltk = tokenize_and_count(groups, nltk_word_tokenize)
    df_nltk[1:3, :]
end
# ╔═╡ 3c0f90fe-512a-470d-8c1d-dbdfb062b4f8 | |
begin
    # Pairwise token statistics using the Penn Treebank tokenizer; show the first 3 rows.
    df_penn = tokenize_and_count(groups, penn_tokenize)
    df_penn[1:3, :]
end
# ╔═╡ 841a2b3b-d353-4e12-906f-c3cfa3c180dd | |
md"## Histogram" | |
# ╔═╡ 00c78e55-ce31-41e9-acc6-f5fff37f569f | |
begin
    # `jaccard` is intersection-over-union, i.e. the Jaccard index (a similarity),
    # so the plot is titled accordingly; the Jaccard distance would be 1 - index.
    histogram(df_penn.jaccard, label="jac")
    title!("Jaccard Index (Penn)")
end
# ╔═╡ d325f3ef-8bf0-4e39-aff8-3289d4956bd9 | |
# Distribution of the pairwise overlap coefficient (Penn tokenizer), in percent.
let overlap_pct = 100 .* df_penn.overlap
    histogram(overlap_pct; label="overlap")
    title!("Overlap(%) (Penn)")
end
# ╔═╡ ea0eee30-213d-468c-a380-9062871e061f | |
begin
    # `jaccard` is intersection-over-union, i.e. the Jaccard index (a similarity),
    # so the plot is titled accordingly; the Jaccard distance would be 1 - index.
    histogram(df_nltk.jaccard, label="jac")
    title!("Jaccard Index (NLTK)")
end
# ╔═╡ 5702777b-e3be-4c54-8f5c-f230f9e5d312 | |
# Distribution of the pairwise overlap coefficient (NLTK tokenizer), in percent.
let overlap_pct = 100 .* df_nltk.overlap
    histogram(overlap_pct; bins=20, label="overlap")
    title!("Overlap(%) (NLTK)")
end
# ╔═╡ 70e7a1d7-75af-4293-b17f-66e371f51bcf | |
# Average the pairwise overlap within each label group, then plot the per-group
# means as percentages.
let
    per_group = combine(
        groupby(df_nltk, :label_group),
        :overlap => mean => :overlap_mean,
    )
    histogram(100 .* per_group.overlap_mean; bins=20, label="overlap")
    title!("Mean Overlap(%) in Group (NLTK)")
end
# ╔═╡ f724ebaa-5ee1-4de8-bc52-a7105f8b51e1 | |
begin
    # Smallest pairwise overlap within each label group, as a percentage.
    # `minimum` reduces the column directly instead of splatting every element
    # into a varargs call to `min`.
    histogram(
        combine(
            groupby(df_nltk, :label_group),
            :overlap => minimum => :min
        ).min .* 100, bins=20, label="overlap")
    title!("Min Overlap(%) in Group (NLTK)")
end
# ╔═╡ cb7af49b-6d06-435a-a616-57da1d3ac7c6 | |
begin
    # Largest pairwise overlap within each label group, as a percentage.
    # `maximum` reduces the column directly instead of splatting every element
    # into a varargs call to `max`.
    histogram(
        combine(
            groupby(df_nltk, :label_group),
            :overlap => maximum => :max
        ).max .* 100, bins=20, label="overlap")
    title!("Max Overlap(%) in Group (NLTK)")
end
# ╔═╡ b9f20397-d9a8-4fa4-ba48-87b820abb5ad | |
# Count the rows in each label group and plot the distribution of group sizes.
let
    group_sizes = combine(groupby(df, :label_group), nrow)
    histogram(group_sizes.nrow; bins=50, label="n")
    title!("Label Group Sizes")
end
# ╔═╡ 7a77465e-aece-4472-b59c-a439c7b8fc1e | |
md"(# of groups with fewer than ten members, # of groups with ten or more members):" | |
# ╔═╡ be867d59-86c8-4b94-93c6-94a88c7628c3 | |
begin
    # One row per label group, with the group's size in the `nrow` column.
    df_group_size = combine(groupby(df, :label_group), nrow)
    # (groups with fewer than ten members, groups with ten or more members)
    (count(<(10), df_group_size.nrow), count(>=(10), df_group_size.nrow))
end
# ╔═╡ ade6a594-15b4-4c97-a76b-60ce3da85be3 | |
md"Largest group(s):"
# ╔═╡ 486ebf45-f33b-4ce9-85e6-c88338a14577 | |
# Keep the group(s) whose size equals the largest group size. `maximum` reduces
# the column directly instead of splatting one argument per label group into `max`.
filter(
    :nrow => ==(maximum(df_group_size.nrow)),
    df_group_size
)
# ╔═╡ c7cd4170-64d2-47c0-af5f-99486dee8c72 | |
md"Pick one group as an example:" | |
# ╔═╡ 844405b4-ff74-4d7f-9c91-1873f3a88e9e | |
# Select every row that belongs to label group 3627744656.
df[df.label_group .== 3627744656, :]
# ╔═╡ 4fbf3f5b-8842-476f-a940-1f9ae8aed69f | |
md"## Samples" | |
# ╔═╡ 8047da13-9795-47ed-ae4d-4447aef085b4 | |
"""
    sample(df::DataFrame, cond, n::Integer)

Filter `df` with `cond` and draw up to `n` of the matching rows at random.

# Arguments
- `df`: table to draw from.
- `cond`: row condition, passed unchanged as the first argument of `filter`.
- `n`: number of rows requested.

# Returns
A tuple `(n_matches, rows)` where `n_matches` is the number of rows satisfying
`cond` and `rows` is a `DataFrame` holding `min(n, n_matches)` of them in random
order.
"""
function sample(df::DataFrame, cond, n::Integer)
    df_tmp = filter(cond, df)
    n_matches = nrow(df_tmp)
    # Clamp to the number of matches so that asking for more rows than exist
    # returns all of them instead of throwing a BoundsError.
    picked = shuffle(1:n_matches)[1:min(n, n_matches)]
    return (n_matches, df_tmp[picked, :])
end
# ╔═╡ 6a6061c6-2b25-4395-816a-08844739e120 | |
md"Jaccard index equals 1 (exactly the same):" | |
# ╔═╡ 964e971c-5e0b-4814-b7c8-ce72ef2be3d3 | |
# Ten random pairs whose token sets are identical (Jaccard index of exactly 1).
sample(df_nltk, :jaccard => ==(1), 10)
# ╔═╡ b76a355d-e5c4-4529-89b0-81fbdd61e86f | |
md"Jaccard index equals 0 (completely different):" | |
# ╔═╡ 9bf1369d-45f2-420a-95ba-2462cab27631 | |
# Ten random pairs that share no tokens at all (Jaccard index of exactly 0).
sample(df_nltk, :jaccard => ==(0), 10)
# ╔═╡ 47b2d64f-9a5a-4015-91c6-9294fd450126 | |
md"Jaccard index between 0 and 0.2 (only slightly similar):" | |
# ╔═╡ f09c2239-5617-4c60-a774-572342f35e6e | |
# Ten random pairs with a Jaccard index strictly between 0 and 0.2.
sample(df_nltk, :jaccard => (x -> 0 < x < 0.2), 10)
# ╔═╡ Cell order: | |
# ╠═b5ad061a-a297-11eb-0308-91355c5c6d15 | |
# ╟─219ff84e-1bb6-4650-b9bc-c1e22a18aff1 | |
# ╠═6c5e5252-0d4c-4422-ada4-377dbf87e9db | |
# ╟─e342798e-e079-4cea-a0f3-482ac0f37c2d | |
# ╠═ee90d5e9-e978-4ad2-a82b-ae4fc0a1abe7 | |
# ╟─2f6784e0-a1e8-4f77-b8b9-34f23797cc17 | |
# ╠═3d04e1cf-dbbf-479d-81dc-fd5d5e6a9b8c | |
# ╠═0e66e630-4ed6-4b7d-938b-ed1a1c14daad | |
# ╠═1598b104-0e27-45cc-8306-e8b68159362b | |
# ╠═e8032f5f-10b8-42f9-b072-07440d06f6f3 | |
# ╠═5bbc4ba9-9ed0-4c22-b934-7ac470ba6e12 | |
# ╠═3e14d636-ed5c-4a4a-9ecf-0a9bd463966d | |
# ╠═c83c778f-baa8-4d2b-a8a9-70411eb64867 | |
# ╠═abd84ebb-a419-4bfa-856f-3a328251c506 | |
# ╠═1b64053e-375b-44a7-a037-d783f1e0c3a5 | |
# ╠═6277476d-64a3-484f-bdf4-df1187d97412 | |
# ╟─ecb1c961-6669-413b-b514-49aaa9743422 | |
# ╠═d1e10f59-71e3-4326-87c3-50dc1dc9a729 | |
# ╠═d8cb00a0-d3a6-42b2-b2d7-9fd9631b88e2 | |
# ╠═8db610b0-f606-42d1-8e06-91f4b9e45a01 | |
# ╠═3c0f90fe-512a-470d-8c1d-dbdfb062b4f8 | |
# ╟─841a2b3b-d353-4e12-906f-c3cfa3c180dd | |
# ╟─00c78e55-ce31-41e9-acc6-f5fff37f569f | |
# ╟─d325f3ef-8bf0-4e39-aff8-3289d4956bd9 | |
# ╟─ea0eee30-213d-468c-a380-9062871e061f | |
# ╟─5702777b-e3be-4c54-8f5c-f230f9e5d312 | |
# ╟─70e7a1d7-75af-4293-b17f-66e371f51bcf | |
# ╟─f724ebaa-5ee1-4de8-bc52-a7105f8b51e1 | |
# ╟─cb7af49b-6d06-435a-a616-57da1d3ac7c6 | |
# ╟─b9f20397-d9a8-4fa4-ba48-87b820abb5ad | |
# ╟─7a77465e-aece-4472-b59c-a439c7b8fc1e | |
# ╟─be867d59-86c8-4b94-93c6-94a88c7628c3 | |
# ╟─ade6a594-15b4-4c97-a76b-60ce3da85be3 | |
# ╟─486ebf45-f33b-4ce9-85e6-c88338a14577 | |
# ╟─c7cd4170-64d2-47c0-af5f-99486dee8c72 | |
# ╠═844405b4-ff74-4d7f-9c91-1873f3a88e9e | |
# ╟─4fbf3f5b-8842-476f-a940-1f9ae8aed69f | |
# ╠═8047da13-9795-47ed-ae4d-4447aef085b4 | |
# ╟─6a6061c6-2b25-4395-816a-08844739e120 | |
# ╠═964e971c-5e0b-4814-b7c8-ce72ef2be3d3 | |
# ╟─b76a355d-e5c4-4529-89b0-81fbdd61e86f | |
# ╠═9bf1369d-45f2-420a-95ba-2462cab27631 | |
# ╟─47b2d64f-9a5a-4015-91c6-9294fd450126 | |
# ╠═f09c2239-5617-4c60-a774-572342f35e6e |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment