Skip to content

Instantly share code, notes, and snippets.

@0V
Last active August 29, 2015 14:17
Show Gist options
  • Save 0V/29456322dbbf6f065372 to your computer and use it in GitHub Desktop.
Save 0V/29456322dbbf6f065372 to your computer and use it in GitHub Desktop.
tweets.csv からラ抜き言葉が含まれているツイートを検出して出力するコード
using LINQtoCSV;
using NMeCab;
using System.Collections.Generic;
namespace RanukiCheckerConsole
{
/// <summary>
/// Console tool that reads <c>tweets.csv</c>, detects tweets that contain
/// "ra-nuki" wording (ラ抜き言葉) by morphological analysis with MeCab, and
/// writes the matching rows to <c>ranuki_tweets.csv</c>.
/// </summary>
class Program
{
    // Zero-based positions inside the comma-separated MeCab feature string.
    // They depend on the dictionary's output format and must be changed with it.
    //
    // UniDic format, for reference (1-based):
    //   1. part of speech (major class)
    //   2. part of speech (middle class)
    //   3. part of speech (minor class)
    //   4. part of speech (fine class)
    //   5. conjugation type
    //   6. conjugation form
    //   7. lexeme reading
    //   8. lexeme (lexeme notation + lexeme subclass)
    //   9. orthographic surface form
    //  10. pronunciation surface form
    //  11. orthographic base form
    //  12. pronunciation base form
    //  13. word origin type
    //
    // NOTE(review): the literals compared in ContainsRanuki ("一段", "未然形",
    // "れる") must match what the configured dictionary actually emits in these
    // fields -- confirm against the dictionary in use.
    private const int ConjugationType = 4; // 活用型
    private const int ConjugationForm = 5; // 活用形
    private const int LexemeReading = 6;   // 語彙素読み

    /// <summary>
    /// Reads every row of <c>tweets.csv</c>, collects the rows whose text contains
    /// a ra-nuki sequence, and writes them to <c>ranuki_tweets.csv</c>.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        var fileName = "tweets.csv";
        var inputFileDescription = new CsvFileDescription
        {
            SeparatorChar = ',',
            EnforceCsvColumnAttribute = true,
            FirstLineHasColumnNames = true
        };

        var context = new CsvContext();

        // Retweets are not filtered out; every row of the input is checked.
        var tweets = context.Read<TweetsCsv>(fileName, inputFileDescription);
        var ranukiList = new List<TweetsCsv>();

        // The tagger is disposed once all tweets have been analysed.
        using (var mecab = MeCabTagger.Create())
        {
            foreach (var tweet in tweets)
            {
                // The text column is declared nullable; there is nothing to analyse in that case.
                if (string.IsNullOrEmpty(tweet.Text))
                {
                    continue;
                }

                if (ContainsRanuki(mecab, tweet.Text))
                {
                    ranukiList.Add(tweet);
                }
            }
        }

        context.Write(ranukiList, "ranuki_tweets.csv");
    }

    /// <summary>
    /// Returns true when <paramref name="text"/> contains a node whose conjugation
    /// type is "一段" and conjugation form is "未然形", immediately followed by a
    /// node whose lexeme-reading field is "れる".
    /// </summary>
    /// <param name="mecab">Tagger used to split the text into nodes.</param>
    /// <param name="text">Tweet text to analyse; must not be null.</param>
    /// <returns>True if the sequence described above occurs at least once.</returns>
    private static bool ContainsRanuki(MeCabTagger mecab, string text)
    {
        // True when the node visited just before the current one was an
        // ichidan verb in its irrealis (mizen) form.
        bool previousWasIchidanMizen = false;

        for (var node = mecab.ParseToNode(text); node != null; node = node.Next)
        {
            var features = node.Feature.Split(',');

            if (previousWasIchidanMizen && FeatureAt(features, LexemeReading) == "れる")
            {
                return true;
            }

            // Ka-row irregular verbs (カ変) are intentionally not treated as
            // candidates here; only ichidan verbs arm the check.
            previousWasIchidanMizen =
                FeatureAt(features, ConjugationType) == "一段" &&
                FeatureAt(features, ConjugationForm) == "未然形";
        }

        return false;
    }

    /// <summary>
    /// Returns the feature at <paramref name="index"/>, or null when the feature
    /// string has fewer fields than expected, so that short feature strings are
    /// treated as "no match" instead of throwing.
    /// </summary>
    private static string FeatureAt(string[] features, int index)
    {
        return index < features.Length ? features[index] : null;
    }
}
}
using LINQtoCSV;
namespace RanukiCheckerConsole
{
/// <summary>
/// One row of the tweets CSV file, mapped column-by-column through
/// <c>CsvColumn</c> attributes. All values are kept as raw strings.
/// </summary>
public class TweetsCsv
{
    /// <summary>Value of the <c>tweet_id</c> column (field 1).</summary>
    [CsvColumn(Name = "tweet_id", FieldIndex = 1)]
    public string Id { get; set; }

    /// <summary>Value of the <c>in_reply_to_status_id</c> column (field 2); may be null.</summary>
    [CsvColumn(Name = "in_reply_to_status_id", FieldIndex = 2, CanBeNull = true)]
    public string InReplyToStatusId { get; set; }

    /// <summary>Value of the <c>in_reply_to_user_id</c> column (field 3); may be null.</summary>
    [CsvColumn(Name = "in_reply_to_user_id", FieldIndex = 3, CanBeNull = true)]
    public string InReplyToUserId { get; set; }

    /// <summary>Value of the <c>timestamp</c> column (field 4); may be null.</summary>
    [CsvColumn(Name = "timestamp", FieldIndex = 4, CanBeNull = true)]
    public string TimeStamp { get; set; }

    /// <summary>Value of the <c>source</c> column (field 5); may be null.</summary>
    [CsvColumn(Name = "source", FieldIndex = 5, CanBeNull = true)]
    public string Source { get; set; }

    /// <summary>Value of the <c>text</c> column (field 6); may be null.</summary>
    [CsvColumn(Name = "text", FieldIndex = 6, CanBeNull = true)]
    public string Text { get; set; }

    /// <summary>Value of the <c>retweeted_status_id</c> column (field 7); may be null.</summary>
    [CsvColumn(Name = "retweeted_status_id", FieldIndex = 7, CanBeNull = true)]
    public string RetweetedStatusId { get; set; }

    /// <summary>Value of the <c>retweeted_status_user_id</c> column (field 8); may be null.</summary>
    [CsvColumn(Name = "retweeted_status_user_id", FieldIndex = 8, CanBeNull = true)]
    public string RetweetedStatusUserId { get; set; }

    /// <summary>
    /// Value of the <c>retweeted_status_timestamp</c> column (field 9); may be null.
    /// Note that, despite the property name, this maps to the timestamp column,
    /// not to a user id.
    /// </summary>
    [CsvColumn(Name = "retweeted_status_timestamp", FieldIndex = 9, CanBeNull = true)]
    public string RetweetedStatusUserIdTimeStamp { get; set; }

    /// <summary>Value of the <c>expanded_urls</c> column (field 10); may be null.</summary>
    [CsvColumn(Name = "expanded_urls", FieldIndex = 10, CanBeNull = true)]
    public string ExpandedUrls { get; set; }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment