Skip to content

Instantly share code, notes, and snippets.

@mturilin
Last active September 1, 2019 20:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mturilin/c28f706a0250fd722323 to your computer and use it in GitHub Desktop.
Save mturilin/c28f706a0250fd722323 to your computer and use it in GitHub Desktop.
Word Count
Download the text file from the URL below, count all the words, and print the top 20 words with the number of times each appears in the file.
URL: http://www.gutenberg.org/files/5200/5200.txt
(please copy this *exact* URL into your code)
Sample output:
the - 34
a - 12
hello - 5
dog - 1
Additional info:
- You can use any documentation available online
- You can only use the standard library of your programming language
- Total time for the task is 60 minutes, including reading the instructions
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
namespace WordFreq
{
    /// <summary>
    /// Console program that downloads a text file, counts how often each word
    /// occurs (case-insensitively) and prints the most frequent words as
    /// "word - count" lines.
    /// </summary>
    class Program
    {
        /// <summary>Address of the text file whose words are counted.</summary>
        private const string SourceUrl = "http://www.gutenberg.org/files/5200/5200.txt";

        /// <summary>Maximum number of words printed.</summary>
        private const int TopCount = 20;

        /// <summary>
        /// Characters that end a word. Tab and carriage return are included so
        /// that tab-separated words and CRLF line endings split cleanly.
        /// </summary>
        private static readonly char[] Separators =
        {
            ' ', '\n', '\r', '\t', '!', '"', '#', '%', '&', '(', ')', '*', ',',
            '-', '.', '/', ':', ';', '?', '@', '[', ']', '_', '{', '}'
        };

        /// <summary>
        /// Downloads the text, counts its words and writes the top entries to
        /// standard output.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            string page;
            using (var client = new WebClient())
            {
                page = client.DownloadString(SourceUrl);
            }

            foreach (var entry in TopWords(CountWords(page), TopCount))
            {
                Console.WriteLine($"{entry.Key} - {entry.Value}");
            }
        }

        /// <summary>
        /// Splits <paramref name="text"/> on <see cref="Separators"/> and counts
        /// the occurrences of each lower-cased word.
        /// </summary>
        /// <param name="text">The text to analyse.</param>
        /// <returns>A map from lower-cased word to its number of occurrences.</returns>
        private static Dictionary<string, int> CountWords(string text)
        {
            var counts = new Dictionary<string, int>();
            var tokens = text.Split(Separators, StringSplitOptions.RemoveEmptyEntries);
            foreach (string token in tokens)
            {
                // Trim removes any remaining whitespace that is not in Separators;
                // invariant lower-casing keeps the keys independent of the
                // machine's current culture.
                string word = token.Trim().ToLowerInvariant();
                if (word.Length == 0)
                {
                    continue;
                }

                // TryGetValue leaves count at 0 when the word has not been seen yet.
                int count;
                counts.TryGetValue(word, out count);
                counts[word] = count + 1;
            }

            return counts;
        }

        /// <summary>
        /// Returns at most <paramref name="limit"/> entries with the highest counts,
        /// most frequent first.
        /// </summary>
        /// <param name="counts">Word counts as produced by <see cref="CountWords"/>.</param>
        /// <param name="limit">Maximum number of entries to return.</param>
        /// <returns>The most frequent entries; fewer than <paramref name="limit"/>
        /// when there are not that many distinct words.</returns>
        private static List<KeyValuePair<string, int>> TopWords(Dictionary<string, int> counts, int limit)
        {
            // Take stops at the end of the sequence, so a text with fewer than
            // `limit` distinct words no longer causes an out-of-range index.
            return counts
                .OrderByDescending(pair => pair.Value)
                .Take(limit)
                .ToList();
        }
    }
}
#!/usr/bin/python
# Python 2 version
import urllib2
from collections import defaultdict
import string
text = urllib2.urlopen("http://www.gutenberg.org/cache/epub/5200/pg5200.txt").read()
words = text.split()
freq = defaultdict(int)
for word in words:
word = word.lower().strip().strip(string.punctuation)
freq[word] += 1
sorted_words = sorted(freq.iteritems(), key=lambda x: x[1], reverse=True)
for i in range(min(len(sorted_words), 20)):
print sorted_words[i][0], '-', sorted_words[i][1]
#!/usr/bin/python3
# Python 3 version
# Downloads a text file, counts its words case-insensitively and prints the
# 20 most frequent ones as "word - count" lines.
from urllib import request
from collections import defaultdict
import string

SOURCE_URL = "http://www.gutenberg.org/cache/epub/5200/pg5200.txt"
TOP_COUNT = 20


def count_words(text):
    """Return a dict mapping each lower-cased word in ``text`` to its count.

    Words are whitespace-separated tokens with leading and trailing
    punctuation removed.
    """
    freq = defaultdict(int)
    for token in text.split():
        word = token.lower().strip(string.punctuation)
        # Tokens made only of punctuation (e.g. "--") strip down to an empty
        # string; skip them so "" is never reported as a word.
        if word:
            freq[word] += 1
    return freq


def top_words(text, limit=TOP_COUNT):
    """Return up to ``limit`` (word, count) pairs, most frequent first.

    Words with equal counts keep the order in which they first appeared,
    because ``sorted`` is stable.
    """
    ranked = sorted(count_words(text).items(), key=lambda x: x[1], reverse=True)
    return ranked[:limit]


def main():
    """Download the text and print its most frequent words."""
    # The context manager closes the HTTP response once the body is read.
    with request.urlopen(SOURCE_URL) as response:
        # 'utf-8-sig' decodes plain UTF-8 and also drops a leading byte-order
        # mark, which would otherwise stay attached to the first word.
        text = response.read().decode('utf-8-sig')
    for word, count in top_words(text):
        print(word, '-', count)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment