Created
November 12, 2012 08:57
-
-
Save fumi/4058241 to your computer and use it in GitHub Desktop.
ckan-1.8 japanese patches
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- ckan/lib/app_globals.py.bak 2012-11-08 14:04:25.366451874 +0900 | |
+++ ckan/lib/app_globals.py 2012-11-08 14:17:06.316450977 +0900 | |
@@ -16,13 +16,13 @@ | |
'app_globals' variable | |
""" | |
- self.site_title = config.get('ckan.site_title', '') | |
+ self.site_title = config.get('ckan.site_title', '').decode("utf-8") | |
self.favicon = config.get('ckan.favicon', | |
'/images/icons/ckan.ico') | |
self.site_logo = config.get('ckan.site_logo', '') | |
self.site_url = config.get('ckan.site_url', '') | |
self.site_url_nice = self.site_url.replace('http://','').replace('www.','') | |
- self.site_description = config.get('ckan.site_description', '') | |
+ self.site_description = config.get('ckan.site_description', '').decode("utf-8") | |
self.site_about = config.get('ckan.site_about', '') | |
self.facets = config.get('search.facets', 'groups tags res_format license').split() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- ckan/config/solr/schema-1.4.xml.orig 2012-11-08 15:10:37.394848259 +0900 | |
+++ ckan/config/solr/schema-1.4.xml 2012-11-08 17:00:38.594813581 +0900 | |
@@ -89,6 +89,63 @@ | |
<filter class="solr.LowerCaseFilterFactory"/> | |
</analyzer> | |
</fieldType> | |
+ | |
+ <!-- Japanese using morphological analysis (see text_cjk for a configuration using bigramming) | |
+ | |
+ NOTE: If you want to optimize search for precision, use default operator AND in your query | |
+ parser config with <solrQueryParser defaultOperator="AND"/> further down in this file. Use | |
+ OR if you would like to optimize for recall (default). | |
+ --> | |
+ <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false"> | |
+ <analyzer> | |
+ <!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer) | |
+ | |
+ Kuromoji has a search mode (default) that does segmentation useful for search. A heuristic | |
+ is used to segment compounds into their parts, and the compound itself is kept as a synonym. | |
+ | |
+ Valid values for attribute mode are: | |
+ normal: regular segmentation | |
+ search: segmentation useful for search, with compounds kept as synonyms (default) | |
+ extended: same as search mode, but unigrams unknown words (experimental) | |
+ | |
+ For some applications it might be good to use search mode for indexing and normal mode for | |
+ queries to reduce recall and prevent parts of compounds from being matched and highlighted. | |
+ Use <analyzer type="index"> and <analyzer type="query"> for this and mode normal in query. | |
+ | |
+ Kuromoji also has a convenient user dictionary feature that allows overriding the statistical | |
+ model with your own entries for segmentation, part-of-speech tags and readings without a need | |
+ to specify weights. Notice that user dictionaries have not been subject to extensive testing. | |
+ | |
+ User dictionary attributes are: | |
+ userDictionary: user dictionary filename | |
+ userDictionaryEncoding: user dictionary encoding (default is UTF-8) | |
+ | |
+ See lang/userdict_ja.txt for a sample user dictionary file. | |
+ | |
+ See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support. | |
+ --> | |
+ <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/> | |
+ <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>--> | |
+ <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) --> | |
+ <filter class="solr.JapaneseBaseFormFilterFactory"/> | |
+ <!-- Removes tokens with certain part-of-speech tags --> | |
+ <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/> | |
+ <!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) --> | |
+ <filter class="solr.CJKWidthFilterFactory"/> | |
+ <!-- Removes common tokens that are typically not useful for search and have a negative effect on ranking --> | |
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" /> | |
+ <!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) --> | |
+ <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/> | |
+ <!-- Lower-cases romaji characters --> | |
+ <filter class="solr.LowerCaseFilterFactory"/> | |
+ </analyzer> | |
+ </fieldType> | |
+ | |
+ <fieldType name="text_ja_morph" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false"> | |
+ <analyzer> | |
+ <tokenizer class="solr.JapaneseTokenizerFactory" /> | |
+ </analyzer> | |
+ </fieldType> | |
</types> | |
@@ -96,7 +153,7 @@ | |
<field name="index_id" type="string" indexed="true" stored="true" required="true" /> | |
<field name="id" type="string" indexed="true" stored="true" required="true" /> | |
<field name="site_id" type="string" indexed="true" stored="true" required="true" /> | |
- <field name="title" type="text" indexed="true" stored="true" /> | |
+ <field name="title" type="text_ja" indexed="true" stored="true" /> | |
<field name="entity_type" type="string" indexed="true" stored="true" omitNorms="true" /> | |
<field name="state" type="string" indexed="true" stored="true" omitNorms="true" /> | |
<field name="name" type="string" indexed="true" stored="true" omitNorms="true" /> | |
@@ -105,10 +162,10 @@ | |
<field name="url" type="string" indexed="true" stored="true" omitNorms="true" /> | |
<field name="ckan_url" type="string" indexed="true" stored="true" omitNorms="true" /> | |
<field name="download_url" type="string" indexed="true" stored="true" omitNorms="true" /> | |
- <field name="notes" type="text" indexed="true" stored="true"/> | |
- <field name="author" type="textgen" indexed="true" stored="true" /> | |
+ <field name="notes" type="text_ja" indexed="true" stored="true"/> | |
+ <field name="author" type="text_ja" indexed="true" stored="true" /> | |
<field name="author_email" type="textgen" indexed="true" stored="true" /> | |
- <field name="maintainer" type="textgen" indexed="true" stored="true" /> | |
+ <field name="maintainer" type="text_ja" indexed="true" stored="true" /> | |
<field name="maintainer_email" type="textgen" indexed="true" stored="true" /> | |
<field name="license" type="string" indexed="true" stored="true" /> | |
<field name="license_id" type="string" indexed="true" stored="true" /> | |
@@ -119,23 +176,23 @@ | |
<field name="capacity" type="string" indexed="true" stored="true" multiValued="false"/> | |
- <field name="res_description" type="textgen" indexed="true" stored="true" multiValued="true"/> | |
+ <field name="res_description" type="text_ja" indexed="true" stored="true" multiValued="true"/> | |
<field name="res_format" type="string" indexed="true" stored="true" multiValued="true"/> | |
<field name="res_url" type="string" indexed="true" stored="true" multiValued="true"/> | |
<!-- catchall field, containing all other searchable text fields (implemented | |
via copyField further on in this schema --> | |
- <field name="text" type="text" indexed="true" stored="false" multiValued="true"/> | |
- <field name="urls" type="text" indexed="true" stored="false" multiValued="true"/> | |
+ <field name="text" type="text_ja" indexed="true" stored="false" multiValued="true"/> | |
+ <field name="urls" type="text_ja" indexed="true" stored="false" multiValued="true"/> | |
- <field name="depends_on" type="text" indexed="true" stored="false" multiValued="true"/> | |
- <field name="dependency_of" type="text" indexed="true" stored="false" multiValued="true"/> | |
- <field name="derives_from" type="text" indexed="true" stored="false" multiValued="true"/> | |
- <field name="has_derivation" type="text" indexed="true" stored="false" multiValued="true"/> | |
- <field name="links_to" type="text" indexed="true" stored="false" multiValued="true"/> | |
- <field name="linked_from" type="text" indexed="true" stored="false" multiValued="true"/> | |
- <field name="child_of" type="text" indexed="true" stored="false" multiValued="true"/> | |
- <field name="parent_of" type="text" indexed="true" stored="false" multiValued="true"/> | |
+ <field name="depends_on" type="text_ja" indexed="true" stored="false" multiValued="true"/> | |
+ <field name="dependency_of" type="text_ja" indexed="true" stored="false" multiValued="true"/> | |
+ <field name="derives_from" type="text_ja" indexed="true" stored="false" multiValued="true"/> | |
+ <field name="has_derivation" type="text_ja" indexed="true" stored="false" multiValued="true"/> | |
+ <field name="links_to" type="text_ja" indexed="true" stored="false" multiValued="true"/> | |
+ <field name="linked_from" type="text_ja" indexed="true" stored="false" multiValued="true"/> | |
+ <field name="child_of" type="text_ja" indexed="true" stored="false" multiValued="true"/> | |
+ <field name="parent_of" type="text_ja" indexed="true" stored="false" multiValued="true"/> | |
<field name="views_total" type="int" indexed="true" stored="false"/> | |
<field name="views_recent" type="int" indexed="true" stored="false"/> | |
<field name="resources_accessed_total" type="int" indexed="true" stored="false"/> | |
@@ -152,7 +209,7 @@ | |
<field name="data_dict" type="string" indexed="false" stored="true" /> | |
- <dynamicField name="extras_*" type="text" indexed="true" stored="true" multiValued="false"/> | |
+ <dynamicField name="extras_*" type="text_ja" indexed="true" stored="true" multiValued="false"/> | |
<dynamicField name="vocab_*" type="string" indexed="true" stored="true" multiValued="true"/> | |
<dynamicField name="*" type="string" indexed="true" stored="false"/> | |
</fields> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment