illy

## Power_of_shell.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              2 stars
            
          
                illy
                / Power_of_shell.md
            
            
              Last active
              January 1, 2016 17:09
            
          
    The Power of Shell

-- Using shell programming for corpus analysis
January, 2012
1. My story

I still remember the first supervision meeting with my supervisor.

  
## censorship.bib
%% This BibTeX bibliography file in UTF-8 format was created using Papers.
%% http://mekentosj.com/papers/

@article{Fu:2013p32492,
author = {King-Wa Fu and Chung-hong Chan and Michael Chau},
title = {Assessing Censorship on Microblogs in China: Discriminatory Keyword Analysis and Impact Evaluation of the ``Real Name Registration'' Policy},
abstract = {Using 111 million microblogs collected between January 1 and June 30, 2012, we report our findings on investigating the use of microblogs in China, or weibo in Chinese, and the impact of censorship practices imposed on Chinese microbloggers. To better control for alternative explanations for censorship decisions that are attributable to an individual's characteristics and choices, we deployed a matched case-control study design to 1) determine a list of Chinese terms that discriminate the censored and uncensored posts when they are written by the same microbloggers, with the list including some homophones and puns created by

## twitter_ling.bib
%% This BibTeX bibliography file in UTF-8 format was created using Papers.
%% http://mekentosj.com/papers/

@article{Williams:2012p29780,
author = {J Williams},
journal = {Proceedings of the 2012 Student Research Workshop, EMNLP2012},
title = {Extracting fine-grained durations for verbs from Twitter},
abstract = {We seek to automatically estimate typical durations for events and habits described in Twitter tweets. A corpus of more than 14 million tweets containing temporal duration information was collected. These tweets were classified as to their habituality status using a bootstrapped, decision tree. For each verb lemma, associated duration information was collected for episodic and habitual uses of the verb. Summary statistics for 483 verb lemmas and their typical habit and episode durations has been compiled and made available. This automatically generated duration information is broadly comparable to hand-annotation.},
pages = {49},
year = {2012},

## twitter_lib.bib
%% This BibTeX bibliography file in UTF-8 format was created using Papers.
%% http://mekentosj.com/papers/

@article{turner_praise_2012,
author = {Julia Turner},
journal = {The New York Times},
title = {In Praise of the Hashtag},
chapter = {Magazine},
year = {2012},
keywords = {Twitter},

## dataframeA.tsv
     NSR SR
184579   0  1
184580   0  1
184581   0  1
184582   0  1
184583   0  1
184584   0  1
184585   0  1
184586   0  1
184587   0  1

## price_sample.tsv
  1 Date     Open    High    Low     Close
  2 2012-03-27   20.12   20.14   20.01   20.04
  3 2012-03-28   20.02   21.00   19.76   20.01
  4 2012-03-29   19.82   19.97   19.71   19.95
  5 2012-03-30   20.07   20.13   19.95   20.07
  6 2012-03-31   NA      NA      NA      NA
  7 2012-04-01   NA      NA      NA      NA
  8 2012-04-02   20.03   20.11   19.90   20.02
  9 2012-04-03   19.97   20.02   19.80   19.96
 10 2012-04-04   19.65   19.81   19.62   19.74

## LM_POS.txt
able
abundance
abundant
acclaimed
accomplish
accomplished
accomplishes
accomplishing
accomplishment
accomplishments

## reordering5.r
# Draw a notched boxplot of `Word` for each `Type`, with the categories shown
# in a fixed order that is imposed by re-levelling the `Type` factor.
# The ggplot2 functions used below must already be attached by the caller.

# Tab-separated input whose first row holds the column names.
sample <- read.table("~/Dropbox/sample.txt", header = TRUE, sep = "\t")

# The level order given here is the left-to-right order of the boxes and also
# the order in which scale_fill_manual() hands out its colours.
sample$Type <- factor(sample$Type, levels = c("NSR", "stock-related", "NTR", "ticker-related", "NEG", "NEU", "POS"))

p <- ggplot(sample)
p <- p +
  # notch = TRUE draws notched boxes; outlier.shape = NA hides outlier points.
  geom_boxplot(aes(x = factor(Type), y = Word, fill = factor(Type)), notch = TRUE, outlier.shape = NA) +
  theme(axis.text.x = element_text(angle = 15, hjust = 0.8, vjust = 1, size = 12),
        axis.text.y = element_text(size = 12)) +
  # "none" is the supported way to drop a legend; a logical FALSE is deprecated.
  guides(fill = "none") +
  # The first two levels (NSR, stock-related) are highlighted, the rest are grey.
  scale_fill_manual(values = c("skyblue1", "skyblue1", "grey60", "grey60", "grey60", "grey60", "grey60"), guide = "none") +
  scale_y_continuous(limits = c(0, 30))

## reordering4.r
# Draw a notched boxplot of `Word` for each `Type`, with the categories shown
# in a fixed order that is imposed on the x scale (the data itself is left
# untouched). The ggplot2 functions used below must already be attached.

# Tab-separated input whose first row holds the column names.
sample <- read.table("~/Dropbox/sample.txt", header = TRUE, sep = "\t")

p <- ggplot(sample)
p <- p +
  # notch = TRUE draws notched boxes; outlier.shape = NA hides outlier points.
  geom_boxplot(aes(x = factor(Type), y = Word, fill = factor(Type)), notch = TRUE, outlier.shape = NA) +
  theme(axis.text.x = element_text(angle = 15, hjust = 0.8, vjust = 1, size = 12),
        axis.text.y = element_text(size = 12)) +
  # "none" is the supported way to drop a legend; a logical FALSE is deprecated.
  guides(fill = "none") +
  # `limits` fixes the left-to-right order of the categories on the axis.
  scale_x_discrete(limits = c("NSR", "stock-related", "NTR", "ticker-related", "NEG", "NEU", "POS")) +
  # Colours are matched to levels by name. factor(Type) keeps the default
  # sorted level order, so an unnamed vector would have to follow that sorted
  # order rather than the axis order above.
  scale_fill_manual(values = c("NSR" = "skyblue1", "stock-related" = "skyblue1",
                               "NTR" = "grey60", "ticker-related" = "grey60",
                               "NEG" = "grey60", "NEU" = "grey60", "POS" = "grey60"),
                    guide = "none") +
  scale_y_continuous(limits = c(0, 30))

## reordering3.r
# Draw and print a notched, grey-scale boxplot of `Word` for each `Type`, with
# the categories shown in a fixed order imposed by re-levelling the `Type`
# factor. The ggplot2 functions used below must already be attached.

# Tab-separated input whose first row holds the column names.
sample <- read.table("~/Dropbox/sample.txt", header = TRUE, sep = "\t")

# The level order given here is the left-to-right order of the boxes.
sample$Type <- factor(sample$Type, levels = c("NSR", "stock-related", "NTR", "ticker-related", "NEG", "NEU", "POS"))

p <- ggplot(sample)
p <- p +
  # notch = TRUE draws notched boxes; outlier.shape = NA hides outlier points.
  geom_boxplot(aes(x = factor(Type), y = Word, fill = factor(Type)), notch = TRUE, outlier.shape = NA) +
  # Every layer line must end in `+`: without it the remaining calls form a
  # separate expression and are never added to `p`.
  theme(axis.text.x = element_text(angle = 15, hjust = 0.8, vjust = 1, size = 12),
        axis.text.y = element_text(size = 12)) +
  # "none" is the supported way to drop a legend; a logical FALSE is deprecated.
  guides(fill = "none") +
  scale_fill_grey() +
  scale_y_continuous(limits = c(0, 30))
print(p)
	%% This BibTeX bibliography file in UTF-8 format was created using Papers.
	%% http://mekentosj.com/papers/

	@article{Fu:2013p32492,
	author = {King-Wa Fu and Chung-hong Chan and Michael Chau},
	title = {Assessing Censorship on Microblogs in China: Discriminatory Keyword Analysis and Impact Evaluation of the ``Real Name Registration'' Policy},
	abstract = {Using 111 million microblogs collected between January 1 and June 30, 2012, we report our findings on investigating the use of microblogs in China, or weibo in Chinese, and the impact of censorship practices imposed on Chinese microbloggers. To better control for alternative explanations for censorship decisions that are attributable to an individual's characteristics and choices, we deployed a matched case-control study design to 1) determine a list of Chinese terms that discriminate the censored and uncensored posts when they are written by the same microbloggers, with the list including some homophones and puns created by
	%% This BibTeX bibliography file in UTF-8 format was created using Papers.
	%% http://mekentosj.com/papers/

	@article{Williams:2012p29780,
	author = {J Williams},
	journal = {Proceedings of the 2012 Student Research Workshop, EMNLP2012},
	title = {Extracting fine-grained durations for verbs from Twitter},
	abstract = {We seek to automatically estimate typical durations for events and habits described in Twitter tweets. A corpus of more than 14 million tweets containing temporal duration information was collected. These tweets were classified as to their habituality status using a bootstrapped, decision tree. For each verb lemma, associated duration information was collected for episodic and habitual uses of the verb. Summary statistics for 483 verb lemmas and their typical habit and episode durations has been compiled and made available. This automatically generated duration information is broadly comparable to hand-annotation.},
	pages = {49},
	year = {2012},
	%% This BibTeX bibliography file in UTF-8 format was created using Papers.
	%% http://mekentosj.com/papers/

	@article{turner_praise_2012,
	author = {Julia Turner},
	journal = {The New York Times},
	title = {In Praise of the Hashtag},
	chapter = {Magazine},
	year = {2012},
	keywords = {Twitter},
	NSR SR
	184579 0 1
	184580 0 1
	184581 0 1
	184582 0 1
	184583 0 1
	184584 0 1
	184585 0 1
	184586 0 1
	184587 0 1
	1 Date Open High Low Close
	2 2012-03-27 20.12 20.14 20.01 20.04
	3 2012-03-28 20.02 21.00 19.76 20.01
	4 2012-03-29 19.82 19.97 19.71 19.95
	5 2012-03-30 20.07 20.13 19.95 20.07
	6 2012-03-31 NA NA NA NA
	7 2012-04-01 NA NA NA NA
	8 2012-04-02 20.03 20.11 19.90 20.02
	9 2012-04-03 19.97 20.02 19.80 19.96
	10 2012-04-04 19.65 19.81 19.62 19.74
	able
	abundance
	abundant
	acclaimed
	accomplish
	accomplished
	accomplishes
	accomplishing
	accomplishment
	accomplishments
	# Draw a notched boxplot of `Word` for each `Type`, with the categories shown
	# in a fixed order that is imposed by re-levelling the `Type` factor.
	# The ggplot2 functions used below must already be attached by the caller.

	# Tab-separated input whose first row holds the column names.
	sample <- read.table("~/Dropbox/sample.txt", header = TRUE, sep = "\t")

	# The level order given here is the left-to-right order of the boxes and also
	# the order in which scale_fill_manual() hands out its colours.
	sample$Type <- factor(sample$Type, levels = c("NSR", "stock-related", "NTR", "ticker-related", "NEG", "NEU", "POS"))

	p <- ggplot(sample)
	p <- p +
	  # notch = TRUE draws notched boxes; outlier.shape = NA hides outlier points.
	  geom_boxplot(aes(x = factor(Type), y = Word, fill = factor(Type)), notch = TRUE, outlier.shape = NA) +
	  theme(axis.text.x = element_text(angle = 15, hjust = 0.8, vjust = 1, size = 12),
	        axis.text.y = element_text(size = 12)) +
	  # "none" is the supported way to drop a legend; a logical FALSE is deprecated.
	  guides(fill = "none") +
	  # The first two levels (NSR, stock-related) are highlighted, the rest are grey.
	  scale_fill_manual(values = c("skyblue1", "skyblue1", "grey60", "grey60", "grey60", "grey60", "grey60"), guide = "none") +
	  scale_y_continuous(limits = c(0, 30))