chaoxu/Contest2011.dyalog

## Contest2011.dyalog
:Namespace Problem1

⎕IO ⎕ML ⎕WX ⎕RL←1 0 0 1093457855

 adjacency←{
     a←(⍺[;1]⍳⍵)        ⍝ edge indexed by numbers
     mat←(2⍴(⍴⍺[;1]))⍴0 ⍝ create a empty matrix
     mat[↓a⍪⌽a]←1       ⍝ set adjacent edges to 1
     mat
 }

 allTrips←{
     (n m)←⍴⍵             ⍝ find the dimensions
     mat←⍵∨((⍳n)∘.=(⍳n))  ⍝ add one to the diagonal of the adjacency matrix
     mat((∨.∧)⍣≡)mat      ⍝ repeatly generate connectivity matrix until a fixed point
 }

 distance←{
     (a x b y)←(○(1÷180))×dms2dd¨⍺,⍵                  ⍝ radian mode. a,b are latitude x,y are longitude
     6371×(-2)○(((2○(x-y))×(2○a)×(2○b))+(1○a)×(1○b))  ⍝ the distance formula
 }

 dms2dd←{
     val←(360 60 60⊥⍵[1 2 3])÷3600 ⍝ convert to degrees
     val+(⍵[4]∊'SW')×-2×val        ⍝ if contains 'S' or 'W', degree is negative -val = (-2val+val)
 }

 reduceSegments←{
     airports←⍺
     segments←⍵
     m←⍴(↓segments)
     closure←allTrips airports adjacency segments
     dis←{
         (a b)←↓airports[⍵;2 3]
         a distance b
     }
     weight←dis¨↓(airports[;1]⍳segments)    ⍝ Find the weight of all edges
     old←segments,weight
     index←⍒weight          ⍝ order by weight
     rindex←⍋index          ⍝ this allows one to change back to the original order
     edges←(↓segments)[index]
     mst←m⍴0
     ⍝ The reverse-delete algorithm greedly delete edges without changing connectivity
     ⍝ There are other algorithms,this is easiest to code in APL
     ⍝ Kruskal's algorithm should perform better by a constant factor
     reverseDelete←{
         z←allTrips airports adjacency↑(1↓⍵,mst/edges)
         mst[m-(⍴1↓⍵)]←~z≡closure
         1↓⍵
     }
     z←(reverseDelete⍣{0=⍴⍵})edges  ⍝ apply reverse-delete until all edges are tested
     mst←mst[rindex]  ⍝ change to original order
     edges←edges[rindex]
     new←(↑mst/edges),↑mst/weight
     (old new)
 }

:EndNamespace
:Namespace Problem2
⎕IO ⎕ML ⎕WX ⎕RL←1 0 0 21902271

 addNoise←{
      ⍝ vec ← pct addNoise vec
     w←⍵
     u←(2*31)-1       ⍝ largest number can be generated by the random generator
     r←(?(⍴⍵)⍴u)<⌈⍺×u ⍝ product a list of numbers, see how many of them < the percentage
     w[r/⍳⍴r]←'X'     ⍝ put 'X' in positions that < percentage
     w
 }

 patFind←{  ⍝ hits←text patFind(pattern tolerance)
     w←toLower⊃⍵[1]     ⍝ search only lower case
     t←0⌈(⍴w)-⍵[2]
     m←⍴⍺
     l←(⍴w),/toLower ⍺  ⍝ n-wise reduce to get a list of substrings
     z←(+/¨({w=⍵}¨l))t ⍝ find positions above certain tolerance
     z←z/⍳⍴z
     s←↑(⍴z)⍴(⊂⍳⍴w)+z-1 ⍝ select substrings above the tolerance
     ↑z,¨(⊂)¨↓⍺[s]
 }

 toLower←{
     w←⍵
     low←⎕UCS 96+⍳26
     i←⎕A⍳w
     j←(i<⍴⎕A)
     w[j/(⍳⍴j)]←low[j/i]
     w
 }

:EndNamespace
:Namespace Problem3

⎕IO ⎕ML ⎕WX ⎕RL←1 0 0 1093457855

 convolve←{
      ⍝ r←cm convolve pattern
     C←⍺
     normal←+/+/C             ⍝ produce the value for normalization
     normal←normal⌈normal0
     (py px)←⍴⍵               ⍝ produce useful number for index calculation
     (m n)←⍴⍺
     (hm hn)←⌊(m n)÷2
     w←⍵
      ⍝ the convolve algorithm for one single block
     c←{
         wrap←{1+⍺|⍵-1}     ⍝ Index get wrapped around
         y←(⍳m)+(⍺-1)-hm
         x←(⍳n)+(⍵-1)-hn
         +/+/w[py wrap y;px wrap x]×C
     }
     r←((⍳py)∘.c(⍳px))÷normal  ⍝ Do convolution on every position
     1⌊⍨r⌈0
 }

 convolveRGB←{
      ⍝ r←cm convolveRGB patterm
      ⍝ Basically, convert things in components and use convolve
      ⍝ then combine the results
     C←⍺
     p←(256 256 256⊤⍵)÷255
     p←{C convolve ⍵}¨((p[1;;])(p[2;;])(p[3;;]))
     ⊃(65536 256 1)+.×⌊p×255
 }

 gaussianMat←{   ⍝ mat←gaussianMat sigma
          ⍝ Apply the formula dirrectly
     s←2×⍵×⍵
     n←1+2×⌈3×⍵
     c←{((⍵-((n-1)÷2))-1)*2}
     v←{(1÷○s)×(*(-((c ⍺)+(c ⍵))÷s))}
     (⍳n)(∘.v)⍳n
 }

 scale←{     ⍝ r←hv scale pattern
      ⍝ simply apply expand
     (m n)←⍺
     (y x)←⍴⍵
     (x⍴n)\(y⍴m)⍀⍵
 }

 toGray←{
     m←256 256 256⊤⍵
     +⌿m÷3×255
 }

 toRGB←{
     n←⌈⍵×255
     +/n∘.×65536 256 1
 }

:EndNamespace
:Namespace Problem4

⎕IO ⎕ML ⎕WX ⎕RL←1 0 0 1093457855

 freqMat←{       ⍝   f←freqMat train
      ⍝ f return the frequency a vector
     f←{+/¨4⍴(⊂⍵)∊¨'A' 'C' 'G' 'T'}
     ⍉↑f¨,⌿⍵
 }

 makePWM←{⍟(⍵+1)÷(4++/⍵[;1])}

 score←{    ⍝   s←pwm score seq
      ⍝ just applying the formula
     n←(⍴⍺)[2]
     p←,/⍺
     f←{
         +/+/¨(,¨4⍴(⊂⍵)∊¨'A' 'C' 'G' 'T')/¨p
     }
     f¨n,/⍵
 }

 top5←{   ⍝ r←pwm top5 seq
      ⍝ apply score to all substring and figure out the best
     n←(⍴⍺)[2]
     z←(⍒⍺ score ⍵)[⍳5]
     ⍵[↑(⍴z)⍴(⊂⍳n)+z-1]
 }

:EndNamespace
:Namespace Problem5

⎕IO ⎕ML ⎕WX ⎕RL←1 0 0 1093457855

 cosine←{       ⍝ c←v1 cosine v2
     ⎕DIV←1       ⍝ make sure 0/0=0
     a←0.5*⍨⍺+.×⍺
     b←0.5*⍨⍵+.×⍵
     c←⍺+.×⍵
     c÷a×b
 }

 dictCount←{              ⍝ counts←dict dictCount string
     token←tokenize ⍵
     word←(∪token)∩⍺      ⍝ produce a list of words in the text and dict, this way
     freq←+⌿token∘.≡word  ⍝ it is faster than do a outer product of text and dict
     r←(⍴⍺)⍴0
     r[⍺⍳word]←freq
     r
 }

 makeDict←{∪⊃,/(tokenize)¨⍵}

 search←{                          ⍝ r←titles search query
     ⍝     Here I demostrate a fast algorithm (a slower but intuitive version--
     ⍝     the slowsearch, also exists in this name space)
     ⍝     The algorithm depend on the following properties
     ⍝     a cosine b = 0 whenever a[i]=0 v b[i]=0 for all i
     ⍝     Therefore one can filter out items without any word in the query.
     ⍝     There is no need to calculate the cosine with the entire dictionary
     ⍝     Only the dictionary of the titles contain the query is required, because
     ⍝     if c ← (a=0 ⍲ b=0)/a   and    d ← (a=0 ⍲ b=0)/b
     ⍝     It's easy to show a cosine b = c cosine d
     q←⍵
     qd←(∪tokenize q)              ⍝ the dictionary for the query
     z←{∨/(tokenize ⍵)∊qd}¨⍺
     t←z/⍺                         ⍝ t contains the titles that contains word in the query
     basis←(makeDict t)
     qv←(basis dictCount q)
     z←{qv cosine basis dictCount ⍵}¨t ⍝ z contains the cosine of each title with the query
     t[(10⌊⍴z)↑⍒z]                ⍝ show the top 10 that is above 0
 }

 slowsearch←{                      ⍝ identical to search, but slower
     d←makeDict ⍺
     s←d dictCount ⍵
     z←{s cosine d dictCount ⍵}¨⍺ ⍝ find cosine of query with each title
     i←⍒z
     z←(z>0)/z
     ⍺[(10⌊⍴z)↑i]                 ⍝ show the top 10 that is above 0
 }

 toLower←{
     w←⍵
     low←⎕UCS 96+⍳26
     i←⎕A⍳w
     j←(i<⍴⎕A)
     w[j/(⍳⍴j)]←low[j/i]
     w
 }

 tokenize←{                      ⍝ tokens←tokenize string
     char←⎕A,⎕D,⎕UCS 96+⍳26     ⍝ list of useful characters
     text←toLower ⍵             ⍝ make all characters lower case
     t←text∊char
     ⎕ML←3                      ⍝ use the IBM's APL, I don't see a easier
     z←t⊂text                   ⍝ way to do this with Dyalog's APL
     ⎕ML←0
     z
 }

:EndNamespace
	:Namespace Problem1

	⎕IO ⎕ML ⎕WX ⎕RL←1 0 0 1093457855

	adjacency←{
	a←(⍺[;1]⍳⍵) ⍝ edge indexed by numbers
	mat←(2⍴(⍴⍺[;1]))⍴0 ⍝ create a empty matrix
	mat[↓a⍪⌽a]←1 ⍝ set adjacent edges to 1
	mat
	}

	allTrips←{
	(n m)←⍴⍵ ⍝ find the dimensions
	mat←⍵∨((⍳n)∘.=(⍳n)) ⍝ add one to the diagonal of the adjacency matrix
	mat((∨.∧)⍣≡)mat ⍝ repeatly generate connectivity matrix until a fixed point
	}

	distance←{
	(a x b y)←(○(1÷180))×dms2dd¨⍺,⍵ ⍝ radian mode. a,b are latitude x,y are longitude
	6371×(-2)○(((2○(x-y))×(2○a)×(2○b))+(1○a)×(1○b)) ⍝ the distance formula
	}

	dms2dd←{
	val←(360 60 60⊥⍵[1 2 3])÷3600 ⍝ convert to degrees
	val+(⍵[4]∊'SW')×-2×val ⍝ if contains 'S' or 'W', degree is negative -val = (-2val+val)
	}

	reduceSegments←{
	airports←⍺
	segments←⍵
	m←⍴(↓segments)
	closure←allTrips airports adjacency segments
	dis←{
	(a b)←↓airports[⍵;2 3]
	a distance b
	}
	weight←dis¨↓(airports[;1]⍳segments) ⍝ Find the weight of all edges
	old←segments,weight
	index←⍒weight ⍝ order by weight
	rindex←⍋index ⍝ this allows one to change back to the original order
	edges←(↓segments)[index]
	mst←m⍴0
	⍝ The reverse-delete algorithm greedly delete edges without changing connectivity
	⍝ There are other algorithms,this is easiest to code in APL
	⍝ Kruskal's algorithm should perform better by a constant factor
	reverseDelete←{
	z←allTrips airports adjacency↑(1↓⍵,mst/edges)
	mst[m-(⍴1↓⍵)]←~z≡closure
	1↓⍵
	}
	z←(reverseDelete⍣{0=⍴⍵})edges ⍝ apply reverse-delete until all edges are tested
	mst←mst[rindex] ⍝ change to original order
	edges←edges[rindex]
	new←(↑mst/edges),↑mst/weight
	(old new)
	}

	:EndNamespace
	:Namespace Problem2
	⎕IO ⎕ML ⎕WX ⎕RL←1 0 0 21902271

	addNoise←{
	⍝ vec ← pct addNoise vec
	w←⍵
	u←(2*31)-1 ⍝ largest number can be generated by the random generator
	r←(?(⍴⍵)⍴u)<⌈⍺×u ⍝ product a list of numbers, see how many of them < the percentage
	w[r/⍳⍴r]←'X' ⍝ put 'X' in positions that < percentage
	w
	}

	patFind←{ ⍝ hits←text patFind(pattern tolerance)
	w←toLower⊃⍵[1] ⍝ search only lower case
	t←0⌈(⍴w)-⍵[2]
	m←⍴⍺
	l←(⍴w),/toLower ⍺ ⍝ n-wise reduce to get a list of substrings
	z←(+/¨({w=⍵}¨l))t ⍝ find positions above certain tolerance
	z←z/⍳⍴z
	s←↑(⍴z)⍴(⊂⍳⍴w)+z-1 ⍝ select substrings above the tolerance
	↑z,¨(⊂)¨↓⍺[s]
	}

	toLower←{
	w←⍵
	low←⎕UCS 96+⍳26
	i←⎕A⍳w
	j←(i<⍴⎕A)
	w[j/(⍳⍴j)]←low[j/i]
	w
	}

	:EndNamespace
	:Namespace Problem3

	⎕IO ⎕ML ⎕WX ⎕RL←1 0 0 1093457855

	convolve←{
	⍝ r←cm convolve pattern
	C←⍺
	normal←+/+/C ⍝ produce the value for normalization
	normal←normal⌈normal0
	(py px)←⍴⍵ ⍝ produce useful number for index calculation
	(m n)←⍴⍺
	(hm hn)←⌊(m n)÷2
	w←⍵
	⍝ the convolve algorithm for one single block
	c←{
	wrap←{1+⍺\|⍵-1} ⍝ Index get wrapped around
	y←(⍳m)+(⍺-1)-hm
	x←(⍳n)+(⍵-1)-hn
	+/+/w[py wrap y;px wrap x]×C
	}
	r←((⍳py)∘.c(⍳px))÷normal ⍝ Do convolution on every position
	1⌊⍨r⌈0
	}

	convolveRGB←{
	⍝ r←cm convolveRGB patterm
	⍝ Basically, convert things in components and use convolve
	⍝ then combine the results
	C←⍺
	p←(256 256 256⊤⍵)÷255
	p←{C convolve ⍵}¨((p[1;;])(p[2;;])(p[3;;]))
	⊃(65536 256 1)+.×⌊p×255
	}

	gaussianMat←{ ⍝ mat←gaussianMat sigma
	⍝ Apply the formula dirrectly
	s←2×⍵×⍵
	n←1+2×⌈3×⍵
	c←{((⍵-((n-1)÷2))-1)*2}
	v←{(1÷○s)×(*(-((c ⍺)+(c ⍵))÷s))}
	(⍳n)(∘.v)⍳n
	}

	scale←{ ⍝ r←hv scale pattern
	⍝ simply apply expand
	(m n)←⍺
	(y x)←⍴⍵
	(x⍴n)\(y⍴m)⍀⍵
	}

	toGray←{
	m←256 256 256⊤⍵
	+⌿m÷3×255
	}

	toRGB←{
	n←⌈⍵×255
	+/n∘.×65536 256 1
	}

	:EndNamespace
	:Namespace Problem4

	⎕IO ⎕ML ⎕WX ⎕RL←1 0 0 1093457855

	freqMat←{ ⍝ f←freqMat train
	⍝ f return the frequency a vector
	f←{+/¨4⍴(⊂⍵)∊¨'A' 'C' 'G' 'T'}
	⍉↑f¨,⌿⍵
	}

	makePWM←{⍟(⍵+1)÷(4++/⍵[;1])}

	score←{ ⍝ s←pwm score seq
	⍝ just applying the formula
	n←(⍴⍺)[2]
	p←,/⍺
	f←{
	+/+/¨(,¨4⍴(⊂⍵)∊¨'A' 'C' 'G' 'T')/¨p
	}
	f¨n,/⍵
	}

	top5←{ ⍝ r←pwm top5 seq
	⍝ apply score to all substring and figure out the best
	n←(⍴⍺)[2]
	z←(⍒⍺ score ⍵)[⍳5]
	⍵[↑(⍴z)⍴(⊂⍳n)+z-1]
	}

	:EndNamespace
	:Namespace Problem5

	⎕IO ⎕ML ⎕WX ⎕RL←1 0 0 1093457855

	cosine←{ ⍝ c←v1 cosine v2
	⎕DIV←1 ⍝ make sure 0/0=0
	a←0.5*⍨⍺+.×⍺
	b←0.5*⍨⍵+.×⍵
	c←⍺+.×⍵
	c÷a×b
	}

	dictCount←{ ⍝ counts←dict dictCount string
	token←tokenize ⍵
	word←(∪token)∩⍺ ⍝ produce a list of words in the text and dict, this way
	freq←+⌿token∘.≡word ⍝ it is faster than do a outer product of text and dict
	r←(⍴⍺)⍴0
	r[⍺⍳word]←freq
	r
	}

	makeDict←{∪⊃,/(tokenize)¨⍵}

	search←{ ⍝ r←titles search query
	⍝ Here I demostrate a fast algorithm (a slower but intuitive version--
	⍝ the slowsearch, also exists in this name space)
	⍝ The algorithm depend on the following properties
	⍝ a cosine b = 0 whenever a[i]=0 v b[i]=0 for all i
	⍝ Therefore one can filter out items without any word in the query.
	⍝ There is no need to calculate the cosine with the entire dictionary
	⍝ Only the dictionary of the titles contain the query is required, because
	⍝ if c ← (a=0 ⍲ b=0)/a and d ← (a=0 ⍲ b=0)/b
	⍝ It's easy to show a cosine b = c cosine d
	q←⍵
	qd←(∪tokenize q) ⍝ the dictionary for the query
	z←{∨/(tokenize ⍵)∊qd}¨⍺
	t←z/⍺ ⍝ t contains the titles that contains word in the query
	basis←(makeDict t)
	qv←(basis dictCount q)
	z←{qv cosine basis dictCount ⍵}¨t ⍝ z contains the cosine of each title with the query
	t[(10⌊⍴z)↑⍒z] ⍝ show the top 10 that is above 0
	}

	slowsearch←{ ⍝ identical to search, but slower
	d←makeDict ⍺
	s←d dictCount ⍵
	z←{s cosine d dictCount ⍵}¨⍺ ⍝ find cosine of query with each title
	i←⍒z
	z←(z>0)/z
	⍺[(10⌊⍴z)↑i] ⍝ show the top 10 that is above 0
	}

	toLower←{
	w←⍵
	low←⎕UCS 96+⍳26
	i←⎕A⍳w
	j←(i<⍴⎕A)
	w[j/(⍳⍴j)]←low[j/i]
	w
	}

	tokenize←{ ⍝ tokens←tokenize string
	char←⎕A,⎕D,⎕UCS 96+⍳26 ⍝ list of useful characters
	text←toLower ⍵ ⍝ make all characters lower case
	t←text∊char
	⎕ML←3 ⍝ use the IBM's APL, I don't see a easier
	z←t⊂text ⍝ way to do this with Dyalog's APL
	⎕ML←0
	z
	}

	:EndNamespace