Created
July 17, 2016 11:01
-
-
Save koseki/3f0546ab4753c79538f296a601565e4d to your computer and use it in GitHub Desktop.
Plot sat 5000 words frequency rank
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env ruby
# frozen_string_literal: true

# Plots the frequency rank of the "SAT 5000" vocabulary words.
#
# Steps:
#   1. Download the SAT word list page (cached as sat5000.html).
#   2. Download a 20k-word list (cached as google-20k.txt); a word's
#      1-based line number in that file is used as its rank.
#   3. Write "rank<TAB>word" lines, sorted by rank, to freq.txt
#      (skipped when freq.txt already exists).
#   4. Run gnuplot to render freq.txt into freq.png.

require 'open-uri'

SAT_URL   = 'http://www.freevocabulary.com/'
SAT_CACHE = 'sat5000.html'
FREQ_URL   = 'https://raw.githubusercontent.com/first20hours/google-10000-english/master/20k.txt'
FREQ_CACHE = 'google-20k.txt'
RANKED_FILE = 'freq.txt'

# Downloads +url+ into +path+ unless +path+ already exists.
#
# The body is read completely before the file is created, so a failed
# download does not leave an empty cache file behind that would block
# every later retry.
def fetch_once(url, path)
  return if File.exist?(path)

  # URI.open replaces Kernel#open for URLs, which no longer works on Ruby 3.0+.
  body = URI.open(url, &:read)
  File.open(path, 'w') { |out| out.puts(body) }
end

# Extracts the vocabulary words from the cached SAT page.
#
# The list is taken as everything from "abase" through "zodiac", split
# on <br> tags; the first whitespace-delimited token of each piece is
# the word. Pieces without any token are dropped.
def sat_words(html)
  listing = html[/(abase.+zodiac)/m, 1]
  abort "Could not locate the word list in #{SAT_CACHE}" unless listing

  listing.split(%r{<br\s*/?>}).map { |line| line[/\S+/] }.compact
end

# Builds a word => rank Hash from the word-list file, where rank is the
# 1-based line number (a word appearing twice keeps its later position).
def rank_table(path)
  File.readlines(path).map(&:chomp).each_with_index.each_with_object({}) do |(word, i), ranks|
    ranks[word] = i + 1
  end
end

fetch_once(SAT_URL, SAT_CACHE)
fetch_once(FREQ_URL, FREQ_CACHE)

unless File.exist?(RANKED_FILE)
  ranks = rank_table(FREQ_CACHE)

  # Words that are absent from the 20k list are left out of the plot.
  ranked = sat_words(File.read(SAT_CACHE))
           .select { |word| ranks.key?(word) }
           .map { |word| [ranks[word], word] }
           .sort_by(&:first)

  File.open(RANKED_FILE, 'w') do |out|
    out.puts(ranked.map { |rank, word| "#{rank}\t#{word}" })
  end
end

# "\\t" keeps a literal backslash-t in the script so that gnuplot itself
# interprets it as a tab separator.
plot = %{set terminal png; set datafile separator "\\t"; set output "freq.png"; plot "#{RANKED_FILE}";}

# Argument-list form of system bypasses the shell, so the quotes inside
# the plot script need no extra escaping.
abort 'gnuplot failed (is it installed?)' unless system('gnuplot', '-e', plot)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I'm wondering why the point
(500, 10,000)
occurs. It seems too sharp.