Skip to content

Instantly share code, notes, and snippets.

@fumi
Created November 12, 2012 08:57
Show Gist options
  • Save fumi/4058241 to your computer and use it in GitHub Desktop.
Save fumi/4058241 to your computer and use it in GitHub Desktop.
CKAN 1.8 Japanese patches
--- ckan/lib/app_globals.py.bak 2012-11-08 14:04:25.366451874 +0900
+++ ckan/lib/app_globals.py 2012-11-08 14:17:06.316450977 +0900
@@ -16,13 +16,13 @@
'app_globals' variable
"""
- self.site_title = config.get('ckan.site_title', '')
+ self.site_title = config.get('ckan.site_title', '').decode("utf-8")
self.favicon = config.get('ckan.favicon',
'/images/icons/ckan.ico')
self.site_logo = config.get('ckan.site_logo', '')
self.site_url = config.get('ckan.site_url', '')
self.site_url_nice = self.site_url.replace('http://','').replace('www.','')
- self.site_description = config.get('ckan.site_description', '')
+ self.site_description = config.get('ckan.site_description', '').decode("utf-8")
self.site_about = config.get('ckan.site_about', '')
self.facets = config.get('search.facets', 'groups tags res_format license').split()
--- ckan/config/solr/schema-1.4.xml.orig 2012-11-08 15:10:37.394848259 +0900
+++ ckan/config/solr/schema-1.4.xml 2012-11-08 17:00:38.594813581 +0900
@@ -89,6 +89,63 @@
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
+
+ <!-- Japanese using morphological analysis (see text_cjk for a configuration using bigramming)
+
+ NOTE: If you want to optimize search for precision, use default operator AND in your query
+ parser config with <solrQueryParser defaultOperator="AND"/> further down in this file. Use
+ OR if you would like to optimize for recall (default).
+ -->
+ <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
+ <analyzer>
+ <!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer)
+
+ Kuromoji has a search mode (default) that does segmentation useful for search. A heuristic
+ is used to segment compounds into its parts and the compound itself is kept as synonym.
+
+ Valid values for attribute mode are:
+ normal: regular segmentation
+ search: segmentation useful for search with synonyms compounds (default)
+ extended: same as search mode, but unigrams unknown words (experimental)
+
+ For some applications it might be good to use search mode for indexing and normal mode for
+ queries to reduce recall and prevent parts of compounds from being matched and highlighted.
+ Use <analyzer type="index"> and <analyzer type="query"> for this and mode normal in query.
+
+ Kuromoji also has a convenient user dictionary feature that allows overriding the statistical
+ model with your own entries for segmentation, part-of-speech tags and readings without a need
+ to specify weights. Notice that user dictionaries have not been subject to extensive testing.
+
+ User dictionary attributes are:
+ userDictionary: user dictionary filename
+ userDictionaryEncoding: user dictionary encoding (default is UTF-8)
+
+ See lang/userdict_ja.txt for a sample user dictionary file.
+
+ See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
+ -->
+ <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
+ <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>-->
+ <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
+ <filter class="solr.JapaneseBaseFormFilterFactory"/>
+ <!-- Removes tokens with certain part-of-speech tags -->
+ <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>
+ <!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) -->
+ <filter class="solr.CJKWidthFilterFactory"/>
+ <!-- Removes common tokens typically not useful for search, but have a negative effect on ranking -->
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
+ <!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) -->
+ <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
+ <!-- Lower-cases romaji characters -->
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="text_ja_morph" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
+ <analyzer> <!-- Tokenizer only: no mode attribute and no filters are configured for this type -->
+ <tokenizer class="solr.JapaneseTokenizerFactory" />
+ </analyzer>
+ </fieldType>
</types>
@@ -96,7 +153,7 @@
<field name="index_id" type="string" indexed="true" stored="true" required="true" />
<field name="id" type="string" indexed="true" stored="true" required="true" />
<field name="site_id" type="string" indexed="true" stored="true" required="true" />
- <field name="title" type="text" indexed="true" stored="true" />
+ <field name="title" type="text_ja" indexed="true" stored="true" />
<field name="entity_type" type="string" indexed="true" stored="true" omitNorms="true" />
<field name="state" type="string" indexed="true" stored="true" omitNorms="true" />
<field name="name" type="string" indexed="true" stored="true" omitNorms="true" />
@@ -105,10 +162,10 @@
<field name="url" type="string" indexed="true" stored="true" omitNorms="true" />
<field name="ckan_url" type="string" indexed="true" stored="true" omitNorms="true" />
<field name="download_url" type="string" indexed="true" stored="true" omitNorms="true" />
- <field name="notes" type="text" indexed="true" stored="true"/>
- <field name="author" type="textgen" indexed="true" stored="true" />
+ <field name="notes" type="text_ja" indexed="true" stored="true"/>
+ <field name="author" type="text_ja" indexed="true" stored="true" />
<field name="author_email" type="textgen" indexed="true" stored="true" />
- <field name="maintainer" type="textgen" indexed="true" stored="true" />
+ <field name="maintainer" type="text_ja" indexed="true" stored="true" />
<field name="maintainer_email" type="textgen" indexed="true" stored="true" />
<field name="license" type="string" indexed="true" stored="true" />
<field name="license_id" type="string" indexed="true" stored="true" />
@@ -119,23 +176,23 @@
<field name="capacity" type="string" indexed="true" stored="true" multiValued="false"/>
- <field name="res_description" type="textgen" indexed="true" stored="true" multiValued="true"/>
+ <field name="res_description" type="text_ja" indexed="true" stored="true" multiValued="true"/>
<field name="res_format" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="res_url" type="string" indexed="true" stored="true" multiValued="true"/>
<!-- catchall field, containing all other searchable text fields (implemented
via copyField further on in this schema -->
- <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="urls" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="text" type="text_ja" indexed="true" stored="false" multiValued="true"/>
+ <field name="urls" type="text_ja" indexed="true" stored="false" multiValued="true"/>
- <field name="depends_on" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="dependency_of" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="derives_from" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="has_derivation" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="links_to" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="linked_from" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="child_of" type="text" indexed="true" stored="false" multiValued="true"/>
- <field name="parent_of" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="depends_on" type="text_ja" indexed="true" stored="false" multiValued="true"/>
+ <field name="dependency_of" type="text_ja" indexed="true" stored="false" multiValued="true"/>
+ <field name="derives_from" type="text_ja" indexed="true" stored="false" multiValued="true"/>
+ <field name="has_derivation" type="text_ja" indexed="true" stored="false" multiValued="true"/>
+ <field name="links_to" type="text_ja" indexed="true" stored="false" multiValued="true"/>
+ <field name="linked_from" type="text_ja" indexed="true" stored="false" multiValued="true"/>
+ <field name="child_of" type="text_ja" indexed="true" stored="false" multiValued="true"/>
+ <field name="parent_of" type="text_ja" indexed="true" stored="false" multiValued="true"/>
<field name="views_total" type="int" indexed="true" stored="false"/>
<field name="views_recent" type="int" indexed="true" stored="false"/>
<field name="resources_accessed_total" type="int" indexed="true" stored="false"/>
@@ -152,7 +209,7 @@
<field name="data_dict" type="string" indexed="false" stored="true" />
- <dynamicField name="extras_*" type="text" indexed="true" stored="true" multiValued="false"/>
+ <dynamicField name="extras_*" type="text_ja" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="vocab_*" type="string" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*" type="string" indexed="true" stored="false"/>
</fields>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment