liebke (owner)

Revisions

gist: 133589 Download_button fork
public
Public Clone URL: git://gist.github.com/133589.git
Embed All Files: show embed
benford_iran.clj #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
;; Chi-square goodness-of-fit analysis of 2009 Iranian election and Benford's law
 
(use '(incanter core stats charts io))
(def votes (read-dataset "data/iran_election_2009.csv"
                         :header true))
(view votes)
 
(def regions (sel votes :cols "Region"))
 
 
(def ahmadinejad-votes (sel votes :cols "Ahmadinejad"))
(def mousavi-votes (sel votes :cols "Mousavi"))
(def rezai-votes (sel votes :cols "Rezai"))
(def karrubi-votes (sel votes :cols "Karrubi"))
 
(defn first-digit [x]
  (Character/digit (first (str x)) 10))
 
(def ahmadinejad (map first-digit ahmadinejad-votes))
(def mousavi (map first-digit mousavi-votes))
(def rezai (map first-digit rezai-votes))
(def karrubi (map first-digit karrubi-votes))
 
 
;; define function for Benford's law
(defn benford-law [d] (log10 (plus 1 (div d))))
;; calculate the probabilities for digits 1-9
(def benford-probs (benford-law (range 1 11)))
;; calculate the expected frequencies for the 30 regions
(def benford-freq (mult benford-probs (count regions)))
 
(defn get-counts [digits]
  (map #(get (:counts (tabulate digits)) % 0)
       (range 1.0 10.0 1.0)))
 
 
(doto (xy-plot (range 1 10) (get-counts ahmadinejad)
               :legend true :series-label "Ahmadinejad"
               :y-label "First digit frequency"
               :x-label "First digit"
               :title "First digit frequency by candidate")
      (add-lines (range 1 10) (get-counts mousavi)
                 :series-label "Mousavi")
      (add-lines (range 1 10) benford-freq
                 :series-label "Predicted")
      ;(add-lines (range 1 10) (get-counts rezai) :series-label "Rezai")
      ;(add-lines (range 1 10) (get-counts karrubi) :series-label "Karrubi")
      clear-background
      view)
 
 
(def ahmadinejad-test
  (chisq-test :table (get-counts ahmadinejad)
              :probs benford-probs))
(:X-sq ahmadinejad-test) ;; 5.439
(:p-value ahmadinejad-test) ;; 0.7098
 
(def mousavi-test
  (chisq-test :table (get-counts mousavi)
              :probs benford-probs))
(:X-sq mousavi-test) ;; 5.775
(:p-value mousavi-test) ;; 0.672
 
(def rezai-test
  (chisq-test :table (get-counts rezai)
              :probs benford-probs))
(:X-sq rezai-test) ;; 12.834
(:p-value rezai-test) ;; 0.118
 
(def karrubi-test
  (chisq-test :table (get-counts karrubi)
              :probs benford-probs))
(:X-sq karrubi-test) ;; 8.8696
(:p-value karrubi-test) ;; 0.353
 
 
 
;; now compare the distribution of the last digit, it should be uniform
;; based on Washington Post article:
;; http://www.washingtonpost.com/wp-dyn/content/article/2009/06/20/AR2009062000004.html?referrer=reddit
 
(defn last-digit [x]
  (Character/digit (last (str x)) 10))
 
(def ahmadinejad-last (map last-digit
                           ahmadinejad-votes))
(def mousavi-last (map last-digit
                       mousavi-votes))
(def rezai-last (map last-digit
                     rezai-votes))
(def karrubi-last (map last-digit
                       karrubi-votes))
 
 
(defn get-counts [digits]
  (map #(get (:counts (tabulate digits)) % 0)
       (range 0.0 10.0 1.0)))
 
 
 
(doto (xy-plot (range 10)
               (get-counts ahmadinejad-last)
               :legend true
               :series-label "Ahmadinejad"
               :y-label "First digit frequency"
               :x-label "First digit"
               :title "Last digit frequency by candidate")
      (add-lines (range 10) (get-counts mousavi-last)
                 :series-label "Mousavi")
      (add-lines (range 10) (get-counts rezai-last)
                 :series-label "Rezai")
      (add-lines (range 10) (get-counts karrubi-last)
                 :series-label "Karrubi")
      clear-background
      view)
 
 
(def ahmadinejad-test
  (chisq-test :table (get-counts ahmadinejad-last)))
 
(:X-sq ahmadinejad-test) ;; 4.667
(:p-value ahmadinejad-test) ;; 0.862
 
(def mousavi-test
  (chisq-test :table (get-counts mousavi-last)))
(:X-sq mousavi-test) ;; 8.667
(:p-value mousavi-test) ;; 0.469
 
(def rezai-test
  (chisq-test :table (get-counts rezai-last)))
(:X-sq rezai-test) ;; 15.333
(:p-value rezai-test) ;; 0.0822
 
(def karrubi-test
  (chisq-test :table (get-counts karrubi-last)))
(:X-sq karrubi-test) ;; 4.0
(:p-value karrubi-test) ;; 0.911