ryanswanstrom/JobAnalysis.java

## JobAnalysis.java
package com.swgoof.datascience.job;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

/**
 * Counts the number of times each word appears in a file of data scientist
 * job descriptions, where a line starting with '--' marks the start of the
 * next job description.
 * It also keeps track of which job descriptions each word occurs in.
 */
public class JobAnalysis {

    /**
     * Reads "datasciencejobs.txt" from the working directory, tallies every
     * word, and prints one line per distinct word (in ascending order of
     * occurrence count) followed by the number of distinct words.
     *
     * @param args the command line arguments (unused)
     * @throws FileNotFoundException if the input file cannot be opened
     * @throws IOException if reading the input file fails
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        // Size of the per-word job flag array; WordObj ignores job indexes
        // outside [0, numOfJobs).
        int numOfJobs = 16;

        Map<String, WordObj> words;
        BufferedReader in = new BufferedReader(new FileReader("datasciencejobs.txt"));
        try {
            words = countWords(in, numOfJobs);
        } finally {
            // Close the reader even when reading or tokenizing throws.
            in.close();
        }

        List<Map.Entry<String, WordObj>> sortedWords = sortForListForWordObj(words);
        for (Map.Entry<String, WordObj> entry : sortedWords) {
            System.out.printf("'%s' %s \n", entry.getKey(), entry.getValue());
        }
        System.out.println("total words: " + sortedWords.size());
    }

    /**
     * Tallies every word read from the given reader.
     *
     * @param in source of the job descriptions, read until end of stream
     * @param numOfJobs size of the job flag array given to each new WordObj
     * @return a map from lower-cased word to its tally; empty if the reader has no lines
     * @throws IOException if reading fails
     */
    private static Map<String, WordObj> countWords(BufferedReader in, int numOfJobs) throws IOException {
        Map<String, WordObj> words = new HashMap<String, WordObj>(150);
        int jobCounter = 0;

        // Test for end of stream BEFORE touching the line, so an empty file
        // produces an empty map instead of a NullPointerException.
        for (String line = in.readLine(); line != null; line = in.readLine()) {
            if (line.startsWith("--")) {
                jobCounter++;
            }

            // Locale.ROOT keeps lower-casing independent of the default locale
            // (e.g. Turkish would map 'I' to a non-ASCII dotless i, which the
            // ASCII-only filter below would then turn into a word break).
            String cleaned = line.toLowerCase(java.util.Locale.ROOT);
            // replace all dots with nothing
            cleaned = cleaned.replaceAll("\\.", "");
            // replace all special chars with space
            cleaned = cleaned.replaceAll("[^A-Za-z0-9]", " ");

            StringTokenizer tokens = new StringTokenizer(cleaned);
            while (tokens.hasMoreTokens()) {
                String word = tokens.nextToken();
                WordObj tally = words.get(word);
                if (tally != null) {
                    // Already in the map; mutating it in place is enough.
                    tally.increment(jobCounter);
                } else {
                    tally = new WordObj(numOfJobs);
                    tally.flagJob(jobCounter);
                    words.put(word, tally);
                }
            }
        }
        return words;
    }

    /**
     * Returns the entries of the given map as a list sorted by ascending
     * occurrence count ({@code WordObj.getNum()}).
     *
     * @param unsortMap the word tallies to sort; not modified
     * @return a new list holding the map's entries in ascending count order
     */
    private static List<Map.Entry<String, WordObj>> sortForListForWordObj(Map<String, WordObj> unsortMap) {

        List<Map.Entry<String, WordObj>> sortedList =
                new ArrayList<Map.Entry<String, WordObj>>(unsortMap.entrySet());

        //sort list based on comparator
        Collections.sort(sortedList, new Comparator<Map.Entry<String, WordObj>>() {

            @Override
            public int compare(Map.Entry<String, WordObj> first, Map.Entry<String, WordObj> second) {
                return first.getValue().getNum().compareTo(second.getValue().getNum());
            }
        });

        return sortedList;
    }
}

/**
 * A class to keep track of how many times a word occurs and which
 * job posts it appears in.
 */
class WordObj {

    /** Number of times the word occurs; the constructor counts the first occurrence. */
    private int num;

    /** {@code inJobs[i]} is true when the word has been flagged for job {@code i}. */
    private final boolean[] inJobs;

    /**
     * Creates a tally with an occurrence count of 1 and no jobs flagged.
     *
     * @param numOfJobs number of job slots that can be flagged
     */
    public WordObj(int numOfJobs) {
        this.num = 1;
        this.inJobs = new boolean[numOfJobs];
    }

    /**
     * Flags the given job and adds one to the occurrence count.
     *
     * @param jobNum index of the job the word was seen in
     */
    public void increment(int jobNum) {
        flagJob(jobNum);
        num++;
    }

    /**
     * Marks the word as present in the given job. Indexes outside
     * {@code [0, numOfJobs)} are ignored.
     *
     * @param jobNum index of the job the word was seen in
     */
    public void flagJob(int jobNum) {
        if (jobNum >= 0 && jobNum < inJobs.length) {
            inJobs[jobNum] = true;
        }
    }

    /**
     * @return a summary of the occurrence count and the number of flagged jobs
     */
    @Override
    public String toString() {
        int flaggedJobs = 0;
        for (boolean flagged : inJobs) {
            if (flagged) {
                flaggedJobs++;
            }
        }
        return String.format(" occured %d times and in %d job descriptions" , getNum(), flaggedJobs);
    }

    /**
     * @return the num
     */
    public Integer getNum() {
        return num;
    }
}
	package com.swgoof.datascience.job;

	import java.io.BufferedReader;
	import java.io.FileNotFoundException;
	import java.io.FileReader;
	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.Comparator;
	import java.util.HashMap;
	import java.util.List;
	import java.util.Map;
	import java.util.StringTokenizer;

	/**
	 * Counts the number of times each word appears in a file of data scientist
	 * job descriptions, where a line starting with '--' marks the start of the
	 * next job description.
	 * It also keeps track of which job descriptions each word occurs in.
	 */
	public class JobAnalysis {

	    /**
	     * Reads "datasciencejobs.txt" from the working directory, tallies every
	     * word, and prints one line per distinct word (in ascending order of
	     * occurrence count) followed by the number of distinct words.
	     *
	     * @param args the command line arguments (unused)
	     * @throws FileNotFoundException if the input file cannot be opened
	     * @throws IOException if reading the input file fails
	     */
	    public static void main(String[] args) throws FileNotFoundException, IOException {
	        // Size of the per-word job flag array; WordObj ignores job indexes
	        // outside [0, numOfJobs).
	        int numOfJobs = 16;

	        Map<String, WordObj> words;
	        BufferedReader in = new BufferedReader(new FileReader("datasciencejobs.txt"));
	        try {
	            words = countWords(in, numOfJobs);
	        } finally {
	            // Close the reader even when reading or tokenizing throws.
	            in.close();
	        }

	        List<Map.Entry<String, WordObj>> sortedWords = sortForListForWordObj(words);
	        for (Map.Entry<String, WordObj> entry : sortedWords) {
	            System.out.printf("'%s' %s \n", entry.getKey(), entry.getValue());
	        }
	        System.out.println("total words: " + sortedWords.size());
	    }

	    /**
	     * Tallies every word read from the given reader.
	     *
	     * @param in source of the job descriptions, read until end of stream
	     * @param numOfJobs size of the job flag array given to each new WordObj
	     * @return a map from lower-cased word to its tally; empty if the reader has no lines
	     * @throws IOException if reading fails
	     */
	    private static Map<String, WordObj> countWords(BufferedReader in, int numOfJobs) throws IOException {
	        Map<String, WordObj> words = new HashMap<String, WordObj>(150);
	        int jobCounter = 0;

	        // Test for end of stream BEFORE touching the line, so an empty file
	        // produces an empty map instead of a NullPointerException.
	        for (String line = in.readLine(); line != null; line = in.readLine()) {
	            if (line.startsWith("--")) {
	                jobCounter++;
	            }

	            // Locale.ROOT keeps lower-casing independent of the default locale
	            // (e.g. Turkish would map 'I' to a non-ASCII dotless i, which the
	            // ASCII-only filter below would then turn into a word break).
	            String cleaned = line.toLowerCase(java.util.Locale.ROOT);
	            // replace all dots with nothing
	            cleaned = cleaned.replaceAll("\\.", "");
	            // replace all special chars with space
	            cleaned = cleaned.replaceAll("[^A-Za-z0-9]", " ");

	            StringTokenizer tokens = new StringTokenizer(cleaned);
	            while (tokens.hasMoreTokens()) {
	                String word = tokens.nextToken();
	                WordObj tally = words.get(word);
	                if (tally != null) {
	                    // Already in the map; mutating it in place is enough.
	                    tally.increment(jobCounter);
	                } else {
	                    tally = new WordObj(numOfJobs);
	                    tally.flagJob(jobCounter);
	                    words.put(word, tally);
	                }
	            }
	        }
	        return words;
	    }

	    /**
	     * Returns the entries of the given map as a list sorted by ascending
	     * occurrence count ({@code WordObj.getNum()}).
	     *
	     * @param unsortMap the word tallies to sort; not modified
	     * @return a new list holding the map's entries in ascending count order
	     */
	    private static List<Map.Entry<String, WordObj>> sortForListForWordObj(Map<String, WordObj> unsortMap) {

	        List<Map.Entry<String, WordObj>> sortedList =
	                new ArrayList<Map.Entry<String, WordObj>>(unsortMap.entrySet());

	        //sort list based on comparator
	        Collections.sort(sortedList, new Comparator<Map.Entry<String, WordObj>>() {

	            @Override
	            public int compare(Map.Entry<String, WordObj> first, Map.Entry<String, WordObj> second) {
	                return first.getValue().getNum().compareTo(second.getValue().getNum());
	            }
	        });

	        return sortedList;
	    }
	}

	/**
	 * A class to keep track of how many times a word occurs and which
	 * job posts it appears in.
	 */
	class WordObj {

	    /** Number of times the word occurs; the constructor counts the first occurrence. */
	    private int num;

	    /** {@code inJobs[i]} is true when the word has been flagged for job {@code i}. */
	    private final boolean[] inJobs;

	    /**
	     * Creates a tally with an occurrence count of 1 and no jobs flagged.
	     *
	     * @param numOfJobs number of job slots that can be flagged
	     */
	    public WordObj(int numOfJobs) {
	        this.num = 1;
	        this.inJobs = new boolean[numOfJobs];
	    }

	    /**
	     * Flags the given job and adds one to the occurrence count.
	     *
	     * @param jobNum index of the job the word was seen in
	     */
	    public void increment(int jobNum) {
	        flagJob(jobNum);
	        num++;
	    }

	    /**
	     * Marks the word as present in the given job. Indexes outside
	     * {@code [0, numOfJobs)} are ignored.
	     *
	     * @param jobNum index of the job the word was seen in
	     */
	    public void flagJob(int jobNum) {
	        if (jobNum >= 0 && jobNum < inJobs.length) {
	            inJobs[jobNum] = true;
	        }
	    }

	    /**
	     * @return a summary of the occurrence count and the number of flagged jobs
	     */
	    @Override
	    public String toString() {
	        int flaggedJobs = 0;
	        for (boolean flagged : inJobs) {
	            if (flagged) {
	                flaggedJobs++;
	            }
	        }
	        return String.format(" occured %d times and in %d job descriptions" , getNum(), flaggedJobs);
	    }

	    /**
	     * @return the num
	     */
	    public Integer getNum() {
	        return num;
	    }
	}