Last active
August 29, 2015 14:17
-
-
Save 0V/29456322dbbf6f065372 to your computer and use it in GitHub Desktop.
tweets.csv からラ抜き言葉が含まれているツイートを検出して出力するコード
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using LINQtoCSV; | |
using NMeCab; | |
using System.Collections.Generic; | |
namespace RanukiCheckerConsole | |
{ | |
/// <summary>
/// Console entry point. Reads tweets from "tweets.csv", runs each tweet text
/// through MeCab, and writes the tweets that appear to contain "ra-nuki" wording
/// (ラ抜き言葉) to "ranuki_tweets.csv".
/// </summary>
class Program
{
    /*
     * UniDic feature format (1-based field numbers):
     *
     *  1. part of speech, major class
     *  2. part of speech, middle class
     *  3. part of speech, minor class
     *  4. part of speech, fine class
     *  5. conjugation type (活用型)
     *  6. conjugation form (活用形)
     *  7. lexeme reading (語彙素読み)
     *  8. lexeme (lexeme notation + lexeme subclass)
     *  9. orthographic surface form
     * 10. pronunciation surface form
     * 11. orthographic base form
     * 12. pronunciation base form
     * 13. word origin type (語種)
     *
     * NOTE(review): the literal values compared against below ("一段", "未然形")
     * may correspond to a different dictionary's output than UniDic - confirm
     * against the dictionary actually installed.
     */

    // Zero-based positions inside the comma-separated feature string.
    // These depend on the dictionary output format; change them if the format changes.
    private const int ConjugationTypeIndex = 4; // conjugation type (活用型)
    private const int ConjugationFormIndex = 5; // conjugation form (活用形)
    private const int LexemeReadingIndex = 6;   // lexeme reading (語彙素読み)

    /// <summary>
    /// Reads the tweet archive, collects every tweet flagged by
    /// <see cref="ContainsRanuki"/>, and writes the result as CSV.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        var fileName = "tweets.csv";
        var inputFileDescription = new CsvFileDescription
        {
            SeparatorChar = ',',
            EnforceCsvColumnAttribute = true,
            FirstLineHasColumnNames = true
        };

        var context = new CsvContext();
        var tweets = context.Read<TweetsCsv>(fileName, inputFileDescription);

        // TODO: retweets are not filtered out; they are analysed like any other tweet.

        var ranukiList = new List<TweetsCsv>();

        // The tagger is disposable, so release it once all tweets have been analysed.
        using (var mecab = MeCabTagger.Create())
        {
            foreach (var tweet in tweets)
            {
                // The text column is declared nullable; there is nothing to parse in that case.
                if (string.IsNullOrEmpty(tweet.Text))
                {
                    continue;
                }

                if (ContainsRanuki(mecab.ParseToNode(tweet.Text)))
                {
                    ranukiList.Add(tweet);
                }
            }
        }

        context.Write(ranukiList, "ranuki_tweets.csv");
    }

    /// <summary>
    /// Walks a parsed node list and reports whether a node whose conjugation type
    /// is "一段" and conjugation form is "未然形" is immediately followed by a node
    /// whose lexeme-reading field is "れる".
    /// </summary>
    /// <param name="node">First node of the parsed sentence; may be null.</param>
    /// <returns>True as soon as the pattern is found; otherwise false.</returns>
    /// <remarks>
    /// Ka-row irregular verbs (カ変) are not handled.
    /// </remarks>
    private static bool ContainsRanuki(MeCabNode node)
    {
        // True when the previous node was an ichidan verb in its irrealis form.
        bool previousWasIchidanMizen = false;

        for (; node != null; node = node.Next)
        {
            var features = (node.Feature ?? string.Empty).Split(',');

            if (previousWasIchidanMizen && GetFeature(features, LexemeReadingIndex) == "れる")
            {
                return true;
            }

            previousWasIchidanMizen =
                GetFeature(features, ConjugationTypeIndex) == "一段" &&
                GetFeature(features, ConjugationFormIndex) == "未然形";
        }

        return false;
    }

    /// <summary>
    /// Returns the feature at <paramref name="index"/>, or null when the feature
    /// string has fewer fields than expected, so callers never index out of range.
    /// </summary>
    private static string GetFeature(string[] features, int index)
    {
        return index < features.Length ? features[index] : null;
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using LINQtoCSV; | |
namespace RanukiCheckerConsole | |
{ | |
/// <summary>
/// One row of a tweet CSV file. Each property is bound to a named column
/// through its <c>CsvColumn</c> attribute; every column except
/// <c>tweet_id</c> is marked as allowed to be null.
/// </summary>
public class TweetsCsv
{
    /// <summary>Value of the <c>tweet_id</c> column (field 1).</summary>
    [CsvColumn(Name = "tweet_id", FieldIndex = 1)]
    public string Id { get; set; }

    /// <summary>Value of the <c>in_reply_to_status_id</c> column (field 2).</summary>
    [CsvColumn(Name = "in_reply_to_status_id", FieldIndex = 2, CanBeNull = true)]
    public string InReplyToStatusId { get; set; }

    /// <summary>Value of the <c>in_reply_to_user_id</c> column (field 3).</summary>
    [CsvColumn(Name = "in_reply_to_user_id", FieldIndex = 3, CanBeNull = true)]
    public string InReplyToUserId { get; set; }

    /// <summary>Value of the <c>timestamp</c> column (field 4), kept as raw text.</summary>
    [CsvColumn(Name = "timestamp", FieldIndex = 4, CanBeNull = true)]
    public string TimeStamp { get; set; }

    /// <summary>Value of the <c>source</c> column (field 5).</summary>
    [CsvColumn(Name = "source", FieldIndex = 5, CanBeNull = true)]
    public string Source { get; set; }

    /// <summary>Value of the <c>text</c> column (field 6).</summary>
    [CsvColumn(Name = "text", FieldIndex = 6, CanBeNull = true)]
    public string Text { get; set; }

    /// <summary>Value of the <c>retweeted_status_id</c> column (field 7).</summary>
    [CsvColumn(Name = "retweeted_status_id", FieldIndex = 7, CanBeNull = true)]
    public string RetweetedStatusId { get; set; }

    /// <summary>Value of the <c>retweeted_status_user_id</c> column (field 8).</summary>
    [CsvColumn(Name = "retweeted_status_user_id", FieldIndex = 8, CanBeNull = true)]
    public string RetweetedStatusUserId { get; set; }

    /// <summary>
    /// Value of the <c>retweeted_status_timestamp</c> column (field 9), kept as raw text.
    /// Note that, despite the property name, this is bound to the timestamp column.
    /// </summary>
    [CsvColumn(Name = "retweeted_status_timestamp", FieldIndex = 9, CanBeNull = true)]
    public string RetweetedStatusUserIdTimeStamp { get; set; }

    /// <summary>Value of the <c>expanded_urls</c> column (field 10).</summary>
    [CsvColumn(Name = "expanded_urls", FieldIndex = 10, CanBeNull = true)]
    public string ExpandedUrls { get; set; }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment