Summary of Week 4 - Caesar Cipher
- 共有 6 位參與者
- encode 的實作大同小異,但 decode 的方法就各異其趣
- decoding 過程中要對於每個可能的明文去評分,而評分的方法主要分為兩種:
- 將字母出現頻率加總,找出最高的那組
- 建出明文的字母頻率表,並與英文字母頻率表比較「距離」,找出最小的那組
- 有人使用字母頻率的排名,而不是頻率本身去計算,但還是解得出來!
- 有人發現 decoding 過程其實可以寫成某種 convolution(小編終於知道以前大一修微積分是幹嘛用的了!)
- 大家建表所選擇的資料結構有很多種(List, Array, Map),但相對於密文大小的時間複雜度應該都是一樣的
Solutions
- 江宗儒
https://gist.github.com/ray851107/dc34c8fd214701474a6a774ad7f1b339
- decode 的方法是去找密文每個字母頻率加起來最高的那組
import Data.Array
import Data.Char
import Data.Function
import Data.List
ord' :: Char -> Int
ord' = subtract (ord 'A') . ord
chr' :: Int -> Char
chr' = chr . (+ ord 'A') . (`mod` 26)
encode :: Int -> String -> String
encode i = map (chr' . (+ i) . ord')
decode' :: Int -> String -> String
decode' = encode . negate
decode :: String -> (String, Int)
decode s =
maximumBy (compare `on` (score . fst)) [(decode' i s, i) | i <- [0 .. 25]]
score :: String -> Double
score = sum . map (logProbs !)
logProbs :: Array Char Double
logProbs =
listArray ('A', 'Z') . map log $
[ 0.08167
, 0.01492
, 0.02782
, 0.04253
, 0.12702
, 0.02228
, 0.02015
, 0.06094
, 0.06966
, 0.00153
, 0.00772
, 0.04025
, 0.02406
, 0.06749
, 0.07507
, 0.01929
, 0.00095
, 0.05987
, 0.06327
, 0.09056
, 0.02758
, 0.00978
, 0.02360
, 0.00150
, 0.01974
, 0.00074
]
- 陳亮廷
https://gist.github.com/L-TChen/add31629e2f6c8e2ea1416646df7fc86
- decode 的方法是將所有可能明文的字母頻率表,與英語的字母頻率表做比較,找距離(Euclidean)最小的那組
{-# LANGUAGE UnicodeSyntax #-}
{-# LANGUAGE TupleSections #-}
module Caesar where
import Data.Map (Map, fromList, fromListWith, unionWith)
import Data.Char
import Data.List
import Data.Function
encode ∷ Int → String → String
encode n str = rotate n <$> str
decode ∷ String → (String, Int)
decode xs = fst $ minimumBy (compare `on` snd) $ scores xs
scores ∷ String → [((String, Int), Float)]
scores xs = do
i ← [0..25]
let cand = encode (-i) xs in
return ((cand, i), distance refFreqTab $ mkFreqTab cand)
-- Rotate English alphabet only
rotate ∷ Int → Char → Char
rotate m x = case (isAsciiUpper x, isAsciiLower x) of
(True, _) → chr $ ((m + ord x - ord 'A') `mod` 26) + ord 'A'
(_, True) → chr $ ((m + ord x - ord 'a') `mod` 26) + ord 'a'
_ → x
-- Euclidean distance
distance ∷ Map Char Float → Map Char Float → Float
distance xs ys = sqrt $ sum $ unionWith (\x y → (x - y)^2) xs ys
-- Make a frequence table for English alphabet
mkFreqTab ∷ String → Map Char Float
mkFreqTab xs = (100/total *) <$> fromListWith (+) (((,1) <$> xs') ++ ((,0) <$> ['A'..'Z']))
where xs' = filter isAsciiUpper (toUpper <$> xs)
total = fromIntegral $ length xs'
-- https://en.wikipedia.org/wiki/Letter_frequency#Relative_frequencies_of_letters_in_the_English_language
refFreqTab ∷ Map Char Float
refFreqTab = fromList
[('E', 12.702), ('T', 9.056), ('A', 8.167), ('O', 7.507), ('I', 6.966), ('N', 6.749),
('S', 6.749), ('H', 6.094), ('R', 5.987), ('D', 4.253), ('L', 4.025), ('C', 2.782),
('U', 2.758), ('M', 2.406), ('W', 2.36), ('F', 2.228), ('G', 2.015), ('Y', 1.974),
('P', 1.929), ('B', 1.492), ('V', 0.978), ('K', 0.772), ('J', 0.153), ('X', 0.15),
('Q', 0.095), ('Z', 0.074)]
- 郭宗霆
https://gist.github.com/jc99kuo/eb0715d347f3cffe21bba1057447d2dd
- decode 的做法也是去找密文每個字母頻率加起來最高的那組
-- FLOLAC 2018 Week 4 -- Caesar Cipher & Decipher
module CaesarCipher (encode, decode) where
import Data.List
letStart = 'A'
letFinal = 'Z'
letSeque = [letStart .. letFinal]
letQty = length letSeque
-- postShift i char
-- shift upper case character by i position and leave other characters unchanged
posShift i char =
case elemIndex char letSeque of
Just m -> letSeque !! mod (m + i) letQty
Nothing -> char
-- encode i str
-- encrypt str by shifting i postion of upper case characters in str
encode :: Int -> String -> String
encode i = map (posShift i)
-- decode str
-- try to decrypt str by matching the char distribution
decode :: String -> (String, Int)
decode str = (map (posShift bestInd ) str, letQty - bestInd)
where
bestInd = fst $ maximumBy (\x y -> compare(snd x) (snd y))
[(i, simScore i msqFreq letFreq) | i <- [1 .. letQty]]
simScore i list1 list2 = foldl1 (+) (zipWith (*) ((replicate i 0) ++ list1) (cycle list2))
msqFreq = [ fromIntegral . length $ filter (== c) str | c <- letSeque ]
letFreq :: [Double]
letFreq =
[ 0.08167 -- 'A'
, 0.01492 -- 'B'
, 0.02782 -- 'C'
, 0.04253 -- 'D'
, 0.12702 -- 'E'
, 0.02228 -- 'F'
, 0.02015 -- 'G'
, 0.06094 -- 'H'
, 0.06966 -- 'I'
, 0.00153 -- 'J'
, 0.00772 -- 'K'
, 0.04025 -- 'L'
, 0.02406 -- 'M'
, 0.06749 -- 'N'
, 0.07507 -- 'O'
, 0.01929 -- 'P'
, 0.00095 -- 'Q'
, 0.05987 -- 'R'
, 0.06327 -- 'S'
, 0.09056 -- 'T'
, 0.02758 -- 'U'
, 0.00978 -- 'V'
, 0.02360 -- 'W'
, 0.00150 -- 'X'
, 0.01974 -- 'Y'
, 0.00074 -- 'Z'
]
{- Testing via GHCi --
*CaesarCipher> encode 23 "THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG"
"QEB NRFZH YOLTK CLU GRJMP LSBO QEB IXWV ALD"
*CaesarCipher> decode "QEB NRFZH YOLTK CLU GRJMP LSBO QEB IXWV ALD"
("THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG",23)
*CaesarCipher> encode 13 "THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG"
"GUR DHVPX OEBJA SBK WHZCF BIRE GUR YNML QBT"
*CaesarCipher> decode "GUR DHVPX OEBJA SBK WHZCF BIRE GUR YNML QBT"
("THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG",13)
*CaesarCipher> encode 3 "THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG"
"WKH TXLFN EURZQ IRA MXPSV RYHU WKH ODCB GRJ"
*CaesarCipher> decode "WKH TXLFN EURZQ IRA MXPSV RYHU WKH ODCB GRJ"
("THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG",3)
-}
- Tai An Su
https://gist.github.com/taiansu/fbfb4c350b2e8ef00fdcf927eba5d3e9
- decode 的方法是將所有可能明文的字母頻率表,與英語的字母頻率表做比較,找距離(Manhattan)最小的那組
import Data.List (group, sort, sortOn)
import qualified Data.Map as M
encode :: Int -> String -> String
encode n = fmap (fetchChar . shift . fromEnum) where
shift = (+ n) . subtract 65
fetchChar = (cycle ['A'..'Z'] !!)
decode :: String -> (String, Int)
decode str = fst . head $ sortOn snd matrix where
matrix = [ row offset str | offset <- [0..25]]
row :: Int -> String -> ((String, Int), Double)
row offset str = ((encoded, offset), likelihood) where
encoded = encode offset str
grouped = group . sort $ encoded
likelihood = foldr sumDistance 0.0 grouped
sumDistance xs acc = acc + distance xs
distance xs = abs $ M.findWithDefault 0.0 (head xs) distribution - freq xs
freq xs = fromIntegral (length xs) / len
len = fromIntegral $ length str
distribution :: M.Map Char Double
distribution = M.fromList [ ('A', 0.08167) , ('B', 0.01492) , ('C', 0.02782) , ('D', 0.04253)
, ('E', 0.02702) , ('F', 0.02228) , ('G', 0.02015) , ('H', 0.06094)
, ('I', 0.06966) , ('J', 0.00153) , ('K', 0.00772) , ('L', 0.04025)
, ('M', 0.02406) , ('N', 0.06749) , ('O', 0.07507) , ('P', 0.01929)
, ('Q', 0.00095) , ('R', 0.05987) , ('S', 0.06327) , ('T', 0.09056)
, ('U', 0.02758) , ('V', 0.00978) , ('W', 0.02360) , ('X', 0.00150)
, ('Y', 0.01974) , ('Z', 0.00074)]
- Yu-Ren Pan
https://gist.github.com/YuRen-tw/7d852147893b97d60b7eb49c55542be0
- decode 的方法很特別,因為比較用的不是字母頻率表,而是字母出現頻率的排名
- aarrr
import Data.Char (ord, chr)
import Data.List
import Data.Function (on)
encode :: Int -> String -> String
encode n = map (shift n)
decode :: String -> (String, Int)
decode = aarrr mkPair (flip minimumBy [0..25] . compareOnDiff . count)
where aarrr f g x = f x $ g x
mkPair xs n = (encode n xs, mod (26-n) 26)
compareOnDiff c = compare `on` diff c
diff :: (Char -> Int) -> Int -> Int
diff c n = sum . change . sort' $ rank
where change = zipWith (\x y -> abs $ x - snd y) [0..25]
sort' = sortBy (flip compare `on` (count' . fst))
count' = c . shift (mod (26-n) 26)
count :: Eq a => [a] -> a -> Int
count = foldl count' (const 0)
where count' :: Eq a => (a -> Int) -> a -> (a -> Int)
count' f x y | x == y = f y + 1
| otherwise = f y
shift :: Int -> Char -> Char
shift n = chr . (\x -> mod (x-oA + n) 26 + oA) . ord
where oA = ord 'A'
-- https://en.wikipedia.org/wiki/Letter_frequency#Relative_frequencies_of_letters_in_the_English_language
-- ETAOINSHRDLCUMWFGYPBVKJXQZ
rank :: [(Char, Int)]
rank = [('A', 2), ('B', 19), ('C', 11), ('D', 9),
('E', 0), ('F', 15), ('G', 16), ('H', 7),
('I', 4), ('J', 22), ('K', 21), ('L', 10),
('M', 13), ('N', 5), ('O', 3), ('P', 18),
('Q', 24), ('R', 8), ('S', 6), ('T', 1),
('U', 12), ('V', 20), ('W', 14), ('X', 23),
('Y', 17), ('Z', 25)]
- 洪崇凱
https://gist.github.com/RedBug312/03f3fd6196539ce076763da0aa83f21c
- 用 convolution 去解釋超有畫面的!!!!
- decode 的做法也是去找密文每個字母頻率加起來最高的那組(如果小編沒有眼花的話)
encode :: Int -> String -> String
encode key plain = [shift key p | p <- plain]
where shift n c = ['A'..'Z'] !! mod (fromEnum c - fromEnum 'A' + n) 26
decode :: String -> (String, Int)
decode cipher = let key = (snd.maximum) search in (encode (-key) cipher, key)
where search = zip (drop 25 $ convolve cipher_stats (letter_freqs ++ letter_freqs)) (0:[25,24..1])
cipher_stats = [fromIntegral.length $ filter (==c) cipher | c <- ['A'..'Z']]
-- 1D discrete convolution altered from stackoverflow.com/a/39784716
-- Since SSE_ij := Σ(a_i - b_j)^2 = Σa_i^2 + Σb_j^2 - 2Σa_ib_j,
-- we can minimize SSE by maximizing Σa_ib_j
convolve :: [Double] -> [Double] -> [Double]
convolve xs ys = convolve' (reverse xs) ys
where convolve' [] ys = []
convolve' (x:xs) ys = add (map (*x) ys) (0 : convolve' xs ys)
add xs ys = if length xs >= length ys
then zipWith (+) xs (ys ++ repeat 0)
else add ys xs
letter_freqs :: [Double]
letter_freqs = [0.08167, 0.01492, 0.02782, 0.04253, 0.12702,
0.02228, 0.02015, 0.06094, 0.06966, 0.00153,
0.00772, 0.04025, 0.02406, 0.06749, 0.07507,
0.01929, 0.00095, 0.05987, 0.06327, 0.09056,
0.02758, 0.00978, 0.02360, 0.00150, 0.01974,
0.00074]
{-
*Main> encode 10 "THEQUICKBROWNFOXJUMPSOVERALAZYDOG"
"DROAESMULBYGXPYHTEWZCYFOBKVKJINYQ"
*Main> decode "DROAESMULBYGXPYHTEWZCYFOBKVKJINYQ"
("THEQUICKBROWNFOXJUMPSOVERALAZYDOG",10)
-}