fumi/app_globals.py.patch

## app_globals.py.patch
--- ckan/lib/app_globals.py.bak 2012-11-08 14:04:25.366451874 +0900
+++ ckan/lib/app_globals.py     2012-11-08 14:17:06.316450977 +0900
@@ -16,13 +16,13 @@
         'app_globals' variable

         """
-        self.site_title = config.get('ckan.site_title', '')
+        self.site_title = config.get('ckan.site_title', '').decode("utf-8")
         self.favicon = config.get('ckan.favicon',
                                   '/images/icons/ckan.ico')
         self.site_logo = config.get('ckan.site_logo', '')
         self.site_url = config.get('ckan.site_url', '')
         self.site_url_nice = self.site_url.replace('http://','').replace('www.','')
-        self.site_description = config.get('ckan.site_description', '')
+        self.site_description = config.get('ckan.site_description', '').decode("utf-8")
         self.site_about = config.get('ckan.site_about', '')

         self.facets = config.get('search.facets', 'groups tags res_format license').split()

## schema-1.4.xml.patch
--- ckan/config/solr/schema-1.4.xml.orig        2012-11-08 15:10:37.394848259 +0900
+++ ckan/config/solr/schema-1.4.xml     2012-11-08 17:00:38.594813581 +0900
@@ -89,6 +89,63 @@
             <filter class="solr.LowerCaseFilterFactory"/>
         </analyzer>
     </fieldType>
+
+    <!-- Japanese using morphological analysis (see text_cjk for a configuration using bigramming)
+
+         NOTE: If you want to optimize search for precision, use default operator AND in your query
+         parser config with <solrQueryParser defaultOperator="AND"/> further down in this file.  Use
+         OR if you would like to optimize for recall (default).
+    -->
+    <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
+      <analyzer>
+      <!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer)
+
+           Kuromoji has a search mode (default) that does segmentation useful for search.  A heuristic
+           is used to segment compounds into its parts and the compound itself is kept as synonym.
+
+           Valid values for attribute mode are:
+              normal: regular segmentation
+              search: segmentation useful for search with synonyms compounds (default)
+            extended: same as search mode, but unigrams unknown words (experimental)
+
+           For some applications it might be good to use search mode for indexing and normal mode for
+           queries to reduce recall and prevent parts of compounds from being matched and highlighted.
+           Use <analyzer type="index"> and <analyzer type="query"> for this and mode normal in query.
+
+           Kuromoji also has a convenient user dictionary feature that allows overriding the statistical
+           model with your own entries for segmentation, part-of-speech tags and readings without a need
+           to specify weights.  Notice that user dictionaries have not been subject to extensive testing.
+
+           User dictionary attributes are:
+                     userDictionary: user dictionary filename
+             userDictionaryEncoding: user dictionary encoding (default is UTF-8)
+
+           See lang/userdict_ja.txt for a sample user dictionary file.
+
+           See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
+        -->
+        <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
+        <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>-->

+        <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
+        <filter class="solr.JapaneseBaseFormFilterFactory"/>
+        <!-- Removes tokens with certain part-of-speech tags -->
+        <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>
+        <!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) -->
+        <filter class="solr.CJKWidthFilterFactory"/>
+        <!-- Removes common tokens typically not useful for search, but have a negative effect on ranking -->
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
+        <!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) -->
+        <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
+        <!-- Lower-cases romaji characters -->
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <fieldType name="text_ja_morph" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
+      <analyzer>
+        <tokenizer class="solr.JapaneseTokenizerFactory" />
+      </analyzer>
+    </fieldType>
 </types>


@@ -96,7 +153,7 @@
     <field name="index_id" type="string" indexed="true" stored="true" required="true" />
     <field name="id" type="string" indexed="true" stored="true" required="true" />
     <field name="site_id" type="string" indexed="true" stored="true" required="true" />
-    <field name="title" type="text" indexed="true" stored="true" />
+    <field name="title" type="text_ja" indexed="true" stored="true" />
     <field name="entity_type" type="string" indexed="true" stored="true" omitNorms="true" />
     <field name="state" type="string" indexed="true" stored="true" omitNorms="true" />
     <field name="name" type="string" indexed="true" stored="true" omitNorms="true" />
@@ -105,10 +162,10 @@
     <field name="url" type="string" indexed="true" stored="true" omitNorms="true" />
     <field name="ckan_url" type="string" indexed="true" stored="true" omitNorms="true" />
     <field name="download_url" type="string" indexed="true" stored="true" omitNorms="true" />
-    <field name="notes" type="text" indexed="true" stored="true"/>
-    <field name="author" type="textgen" indexed="true" stored="true" />
+    <field name="notes" type="text_ja" indexed="true" stored="true"/>
+    <field name="author" type="text_ja" indexed="true" stored="true" />
     <field name="author_email" type="textgen" indexed="true" stored="true" />
-    <field name="maintainer" type="textgen" indexed="true" stored="true" />
+    <field name="maintainer" type="text_ja" indexed="true" stored="true" />
     <field name="maintainer_email" type="textgen" indexed="true" stored="true" />
     <field name="license" type="string" indexed="true" stored="true" />
     <field name="license_id" type="string" indexed="true" stored="true" />
@@ -119,23 +176,23 @@

     <field name="capacity" type="string" indexed="true" stored="true" multiValued="false"/>

-    <field name="res_description" type="textgen" indexed="true" stored="true" multiValued="true"/>
+    <field name="res_description" type="text_ja" indexed="true" stored="true" multiValued="true"/>
     <field name="res_format" type="string" indexed="true" stored="true" multiValued="true"/>
     <field name="res_url" type="string" indexed="true" stored="true" multiValued="true"/>

     <!-- catchall field, containing all other searchable text fields (implemented
          via copyField further on in this schema  -->
-    <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
-    <field name="urls" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="text" type="text_ja" indexed="true" stored="false" multiValued="true"/>
+    <field name="urls" type="text_ja" indexed="true" stored="false" multiValued="true"/>

-    <field name="depends_on" type="text" indexed="true" stored="false" multiValued="true"/>
-    <field name="dependency_of" type="text" indexed="true" stored="false" multiValued="true"/>
-    <field name="derives_from" type="text" indexed="true" stored="false" multiValued="true"/>
-    <field name="has_derivation" type="text" indexed="true" stored="false" multiValued="true"/>
-    <field name="links_to" type="text" indexed="true" stored="false" multiValued="true"/>
-    <field name="linked_from" type="text" indexed="true" stored="false" multiValued="true"/>
-    <field name="child_of" type="text" indexed="true" stored="false" multiValued="true"/>
-    <field name="parent_of" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="depends_on" type="text_ja" indexed="true" stored="false" multiValued="true"/>
+    <field name="dependency_of" type="text_ja" indexed="true" stored="false" multiValued="true"/>
+    <field name="derives_from" type="text_ja" indexed="true" stored="false" multiValued="true"/>
+    <field name="has_derivation" type="text_ja" indexed="true" stored="false" multiValued="true"/>
+    <field name="links_to" type="text_ja" indexed="true" stored="false" multiValued="true"/>
+    <field name="linked_from" type="text_ja" indexed="true" stored="false" multiValued="true"/>
+    <field name="child_of" type="text_ja" indexed="true" stored="false" multiValued="true"/>
+    <field name="parent_of" type="text_ja" indexed="true" stored="false" multiValued="true"/>
     <field name="views_total" type="int" indexed="true" stored="false"/>
     <field name="views_recent" type="int" indexed="true" stored="false"/>
     <field name="resources_accessed_total" type="int" indexed="true" stored="false"/>
@@ -152,7 +209,7 @@

     <field name="data_dict" type="string" indexed="false" stored="true" />

-    <dynamicField name="extras_*" type="text" indexed="true" stored="true" multiValued="false"/>
+    <dynamicField name="extras_*" type="text_ja" indexed="true" stored="true" multiValued="false"/>
     <dynamicField name="vocab_*" type="string" indexed="true" stored="true" multiValued="true"/>
     <dynamicField name="*" type="string" indexed="true"  stored="false"/>
 </fields>
	--- ckan/lib/app_globals.py.bak 2012-11-08 14:04:25.366451874 +0900
	+++ ckan/lib/app_globals.py 2012-11-08 14:17:06.316450977 +0900
	@@ -16,13 +16,13 @@
	'app_globals' variable

	"""
	- self.site_title = config.get('ckan.site_title', '')
	+ self.site_title = config.get('ckan.site_title', '').decode("utf-8")
	self.favicon = config.get('ckan.favicon',
	'/images/icons/ckan.ico')
	self.site_logo = config.get('ckan.site_logo', '')
	self.site_url = config.get('ckan.site_url', '')
	self.site_url_nice = self.site_url.replace('http://','').replace('www.','')
	- self.site_description = config.get('ckan.site_description', '')
	+ self.site_description = config.get('ckan.site_description', '').decode("utf-8")
	self.site_about = config.get('ckan.site_about', '')

	self.facets = config.get('search.facets', 'groups tags res_format license').split()
	--- ckan/config/solr/schema-1.4.xml.orig 2012-11-08 15:10:37.394848259 +0900
	+++ ckan/config/solr/schema-1.4.xml 2012-11-08 17:00:38.594813581 +0900
	@@ -89,6 +89,63 @@
	<filter class="solr.LowerCaseFilterFactory"/>
	</analyzer>
	</fieldType>
	+
	+ <!-- Japanese using morphological analysis (see text_cjk for a configuration using bigramming)
	+
	+ NOTE: If you want to optimize search for precision, use default operator AND in your query
	+ parser config with <solrQueryParser defaultOperator="AND"/> further down in this file. Use
	+ OR if you would like to optimize for recall (default).
	+ -->
	+ <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
	+ <analyzer>
	+ <!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer)
	+
	+ Kuromoji has a search mode (default) that does segmentation useful for search. A heuristic
	+ is used to segment compounds into its parts and the compound itself is kept as synonym.
	+
	+ Valid values for attribute mode are:
	+ normal: regular segmentation
	+ search: segmentation useful for search with synonyms compounds (default)
	+ extended: same as search mode, but unigrams unknown words (experimental)
	+
	+ For some applications it might be good to use search mode for indexing and normal mode for
	+ queries to reduce recall and prevent parts of compounds from being matched and highlighted.
	+ Use <analyzer type="index"> and <analyzer type="query"> for this and mode normal in query.
	+
	+ Kuromoji also has a convenient user dictionary feature that allows overriding the statistical
	+ model with your own entries for segmentation, part-of-speech tags and readings without a need
	+ to specify weights. Notice that user dictionaries have not been subject to extensive testing.
	+
	+ User dictionary attributes are:
	+ userDictionary: user dictionary filename
	+ userDictionaryEncoding: user dictionary encoding (default is UTF-8)
	+
	+ See lang/userdict_ja.txt for a sample user dictionary file.
	+
	+ See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
	+ -->
	+ <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
	+ <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>-->

	+ <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
	+ <filter class="solr.JapaneseBaseFormFilterFactory"/>
	+ <!-- Removes tokens with certain part-of-speech tags -->
	+ <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>
	+ <!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) -->
	+ <filter class="solr.CJKWidthFilterFactory"/>
	+ <!-- Removes common tokens typically not useful for search, but have a negative effect on ranking -->
	+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
	+ <!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) -->
	+ <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
	+ <!-- Lower-cases romaji characters -->
	+ <filter class="solr.LowerCaseFilterFactory"/>
	+ </analyzer>
	+ </fieldType>
	+
	+ <fieldType name="text_ja_morph" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
	+ <analyzer>
	+ <tokenizer class="solr.JapaneseTokenizerFactory" />
	+ </analyzer>
	+ </fieldType>
	</types>


	@@ -96,7 +153,7 @@
	<field name="index_id" type="string" indexed="true" stored="true" required="true" />
	<field name="id" type="string" indexed="true" stored="true" required="true" />
	<field name="site_id" type="string" indexed="true" stored="true" required="true" />
	- <field name="title" type="text" indexed="true" stored="true" />
	+ <field name="title" type="text_ja" indexed="true" stored="true" />
	<field name="entity_type" type="string" indexed="true" stored="true" omitNorms="true" />
	<field name="state" type="string" indexed="true" stored="true" omitNorms="true" />
	<field name="name" type="string" indexed="true" stored="true" omitNorms="true" />
	@@ -105,10 +162,10 @@
	<field name="url" type="string" indexed="true" stored="true" omitNorms="true" />
	<field name="ckan_url" type="string" indexed="true" stored="true" omitNorms="true" />
	<field name="download_url" type="string" indexed="true" stored="true" omitNorms="true" />
	- <field name="notes" type="text" indexed="true" stored="true"/>
	- <field name="author" type="textgen" indexed="true" stored="true" />
	+ <field name="notes" type="text_ja" indexed="true" stored="true"/>
	+ <field name="author" type="text_ja" indexed="true" stored="true" />
	<field name="author_email" type="textgen" indexed="true" stored="true" />
	- <field name="maintainer" type="textgen" indexed="true" stored="true" />
	+ <field name="maintainer" type="text_ja" indexed="true" stored="true" />
	<field name="maintainer_email" type="textgen" indexed="true" stored="true" />
	<field name="license" type="string" indexed="true" stored="true" />
	<field name="license_id" type="string" indexed="true" stored="true" />
	@@ -119,23 +176,23 @@

	<field name="capacity" type="string" indexed="true" stored="true" multiValued="false"/>

	- <field name="res_description" type="textgen" indexed="true" stored="true" multiValued="true"/>
	+ <field name="res_description" type="text_ja" indexed="true" stored="true" multiValued="true"/>
	<field name="res_format" type="string" indexed="true" stored="true" multiValued="true"/>
	<field name="res_url" type="string" indexed="true" stored="true" multiValued="true"/>

	<!-- catchall field, containing all other searchable text fields (implemented
	via copyField further on in this schema -->
	- <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
	- <field name="urls" type="text" indexed="true" stored="false" multiValued="true"/>
	+ <field name="text" type="text_ja" indexed="true" stored="false" multiValued="true"/>
	+ <field name="urls" type="text_ja" indexed="true" stored="false" multiValued="true"/>

	- <field name="depends_on" type="text" indexed="true" stored="false" multiValued="true"/>
	- <field name="dependency_of" type="text" indexed="true" stored="false" multiValued="true"/>
	- <field name="derives_from" type="text" indexed="true" stored="false" multiValued="true"/>
	- <field name="has_derivation" type="text" indexed="true" stored="false" multiValued="true"/>
	- <field name="links_to" type="text" indexed="true" stored="false" multiValued="true"/>
	- <field name="linked_from" type="text" indexed="true" stored="false" multiValued="true"/>
	- <field name="child_of" type="text" indexed="true" stored="false" multiValued="true"/>
	- <field name="parent_of" type="text" indexed="true" stored="false" multiValued="true"/>
	+ <field name="depends_on" type="text_ja" indexed="true" stored="false" multiValued="true"/>
	+ <field name="dependency_of" type="text_ja" indexed="true" stored="false" multiValued="true"/>
	+ <field name="derives_from" type="text_ja" indexed="true" stored="false" multiValued="true"/>
	+ <field name="has_derivation" type="text_ja" indexed="true" stored="false" multiValued="true"/>
	+ <field name="links_to" type="text_ja" indexed="true" stored="false" multiValued="true"/>
	+ <field name="linked_from" type="text_ja" indexed="true" stored="false" multiValued="true"/>
	+ <field name="child_of" type="text_ja" indexed="true" stored="false" multiValued="true"/>
	+ <field name="parent_of" type="text_ja" indexed="true" stored="false" multiValued="true"/>
	<field name="views_total" type="int" indexed="true" stored="false"/>
	<field name="views_recent" type="int" indexed="true" stored="false"/>
	<field name="resources_accessed_total" type="int" indexed="true" stored="false"/>
	@@ -152,7 +209,7 @@

	<field name="data_dict" type="string" indexed="false" stored="true" />

	- <dynamicField name="extras_*" type="text" indexed="true" stored="true" multiValued="false"/>
	+ <dynamicField name="extras_*" type="text_ja" indexed="true" stored="true" multiValued="false"/>
	<dynamicField name="vocab_*" type="string" indexed="true" stored="true" multiValued="true"/>
	<dynamicField name="*" type="string" indexed="true" stored="false"/>
	</fields>