Last active
September 1, 2019 20:44
-
-
Save mturilin/c28f706a0250fd722323 to your computer and use it in GitHub Desktop.
Word Count
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Get a text file from a URL, count all the words, and print the top 20 words with the number of times each appears in the file.
URL: http://www.gutenberg.org/files/5200/5200.txt
(please copy this *exact* URL into your code)
Sample output:
the - 34
a - 12
hello - 5
…
dog - 1
Additional info:
- You can use any documentation available online
- You can only use the standard library of your programming language
- Total time for the task is 60 minutes, including reading the instructions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Net; | |
namespace WordFreq
{
    /// <summary>
    /// Downloads a text file, counts how often each word occurs, and prints
    /// the most frequent words as "word - count" lines.
    /// </summary>
    class Program
    {
        /// <summary>Address of the text file to analyse.</summary>
        private const string SourceUrl = "http://www.gutenberg.org/files/5200/5200.txt";

        /// <summary>Maximum number of words to print.</summary>
        private const int TopCount = 20;

        /// <summary>
        /// Characters that end a word. Tab and carriage return are included so that
        /// tab-separated words and Windows line endings split correctly.
        /// </summary>
        private static readonly char[] Separators =
        {
            ' ', '\t', '\r', '\n', '!', '"', '#', '%', '&', '(', ')', '*', ',',
            '-', '.', '/', ':', ';', '?', '@', '[', ']', '_', '{', '}'
        };

        /// <summary>
        /// Program entry point: downloads the text and writes the top words to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string page;
            using (var client = new WebClient())
            {
                page = client.DownloadString(SourceUrl);
            }

            foreach (KeyValuePair<string, int> entry in TopWords(CountWords(page), TopCount))
            {
                Console.WriteLine($"{entry.Key} - {entry.Value}");
            }
        }

        /// <summary>
        /// Splits <paramref name="text"/> on <see cref="Separators"/> and counts each
        /// lower-cased word.
        /// </summary>
        /// <param name="text">The text to analyse.</param>
        /// <returns>A map from lower-cased word to its number of occurrences.</returns>
        private static Dictionary<string, int> CountWords(string text)
        {
            var counts = new Dictionary<string, int>(StringComparer.Ordinal);
            foreach (string token in text.Split(Separators, StringSplitOptions.RemoveEmptyEntries))
            {
                // Invariant lower-casing keeps the keys independent of the machine's culture.
                string word = token.Trim().ToLowerInvariant();
                if (word.Length == 0)
                {
                    continue;
                }

                // TryGetValue leaves 'seen' at 0 when the word is new.
                int seen;
                counts.TryGetValue(word, out seen);
                counts[word] = seen + 1;
            }

            return counts;
        }

        /// <summary>
        /// Returns up to <paramref name="limit"/> entries with the highest counts.
        /// Ties are ordered by word (ordinal) so the output is deterministic.
        /// </summary>
        /// <param name="counts">Word counts to rank.</param>
        /// <param name="limit">Maximum number of entries to return.</param>
        /// <returns>The ranked entries; fewer than <paramref name="limit"/> when there are not enough distinct words.</returns>
        private static List<KeyValuePair<string, int>> TopWords(Dictionary<string, int> counts, int limit)
        {
            return counts
                .OrderByDescending(pair => pair.Value)
                .ThenBy(pair => pair.Key, StringComparer.Ordinal)
                .Take(limit)
                .ToList();
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# Python 2 version | |
import urllib2 | |
from collections import defaultdict | |
import string | |
SOURCE_URL = "http://www.gutenberg.org/cache/epub/5200/pg5200.txt"
TOP_N = 20
UTF8_BOM = b'\xef\xbb\xbf'


def count_words(text):
    """Count lower-cased words in ``text``.

    The text is split on whitespace and surrounding punctuation is stripped
    from every token. Tokens that consist only of punctuation (for example
    ``--``) are skipped so the empty string is never counted as a word.

    Returns a mapping from word to number of occurrences.
    """
    freq = defaultdict(int)
    for token in text.split():
        word = token.lower().strip(string.punctuation)
        if word:
            freq[word] += 1
    return freq


def top_words(freq, limit):
    """Return up to ``limit`` ``(word, count)`` pairs, most frequent first.

    Ties are ordered alphabetically so the result is deterministic.
    """
    ranked = sorted(freq.items(), key=lambda item: (-item[1], item[0]))
    return ranked[:limit]


def main():
    """Download the text and print the most frequent words as ``word - count``."""
    response = urllib2.urlopen(SOURCE_URL)
    try:
        text = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    # Drop a leading UTF-8 byte-order mark, if present, so it does not
    # become part of the first word.
    if text.startswith(UTF8_BOM):
        text = text[len(UTF8_BOM):]

    for word, count in top_words(count_words(text), TOP_N):
        print("%s - %s" % (word, count))


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# Python 3 version | |
from urllib import request | |
from collections import defaultdict | |
import string | |
SOURCE_URL = "http://www.gutenberg.org/cache/epub/5200/pg5200.txt"
TOP_N = 20


def count_words(text):
    """Count lower-cased words in ``text``.

    The text is split on whitespace and surrounding punctuation is stripped
    from every token. Tokens that consist only of punctuation (for example
    ``--``) are skipped so the empty string is never counted as a word.

    Returns a mapping from word to number of occurrences.
    """
    freq = defaultdict(int)
    for token in text.split():
        word = token.lower().strip(string.punctuation)
        if word:
            freq[word] += 1
    return freq


def top_words(freq, limit):
    """Return up to ``limit`` ``(word, count)`` pairs, most frequent first.

    Ties are ordered alphabetically so the result is deterministic.
    """
    ranked = sorted(freq.items(), key=lambda item: (-item[1], item[0]))
    return ranked[:limit]


def main():
    """Download the text and print the most frequent words as ``word - count``."""
    # The context manager closes the HTTP response once the body is read.
    with request.urlopen(SOURCE_URL) as response:
        # 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark, if
        # present, so it does not become part of the first word.
        text = response.read().decode('utf-8-sig')

    for word, count in top_words(count_words(text), TOP_N):
        print(word, '-', count)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment