SharpICTCLAS 虽说是开源的，但后来就没有人再进行维护了；跑搜狗的语料问题不少，就连 C# 版本的作者也承认有不少问题。
想得到更为准确的分词结果，还是要研究 SharpICTCLAS 的原版，也就是 ICTCLAS 官方版。它的 dll 文件不是用 C# 开发的，所以要通过 DllImport 引入。
先自己写了个类：
代码
using System;
using System.Collections.Generic;
using System.Text;
using System.Runtime.InteropServices;

namespace test
{
    /// <summary>
    /// Managed mirror of the native ICTCLAS <c>result_t</c> record that
    /// <c>ICTCLAS.ParagraphProcessAW</c> fills in: eight consecutive 32-bit fields.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the numeric offsets were missing from the published listing.
    /// They are reconstructed here as a packed run of 4-byte ints (0, 4, ... 28);
    /// confirm against the header of the DLL you actually ship.
    /// </remarks>
    [StructLayout(LayoutKind.Explicit)]
    public struct result_t
    {
        [FieldOffset(0)]
        public int start;

        [FieldOffset(4)]
        public int length;

        // sPos / sPosLow presumably overlay an 8-byte native field - TODO confirm.
        [FieldOffset(8)]
        public int sPos;

        [FieldOffset(12)]
        public int sPosLow;

        [FieldOffset(16)]
        public int POS_id;

        [FieldOffset(20)]
        public int word_ID;

        [FieldOffset(24)]
        public int word_type;

        [FieldOffset(28)]
        public int weight;
    }

    /// <summary>
    /// P/Invoke bindings for the native ICTCLAS DLL. Every member forwards to the
    /// export named in its <c>EntryPoint</c>; strings are marshalled as ANSI.
    /// </summary>
    class ICTCLAS
    {
        // NOTE(review): digits were stripped from the published listing, so the
        // real file name may carry a version suffix - confirm before shipping.
        const string path = @"ICTCLAS.dll";

        // NOTE(review): the bool-returning imports below assume the native
        // functions return a 1-byte C++ bool, hence UnmanagedType.I1 instead of
        // the default 4-byte BOOL marshalling - confirm against the native header.

        /// <summary>Calls <c>ICTCLAS_Init</c>.</summary>
        /// <param name="sInitDirPath">Forwarded unchanged to the native call; may be null.</param>
        /// <returns>The native call's boolean result.</returns>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Init")]
        [return: MarshalAs(UnmanagedType.I1)]
        public static extern bool Init(String sInitDirPath);

        // Raw import returning the native pointer. Declaring the return type as
        // String would make the runtime free the returned buffer after copying,
        // which is wrong if the library owns that memory.
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcess")]
        private static extern IntPtr ParagraphProcessNative(String sParagraph, int bPOStagged);

        /// <summary>
        /// Calls <c>ICTCLAS_ParagraphProcess</c> and copies the ANSI result into a
        /// managed string without freeing the native buffer.
        /// </summary>
        /// <param name="sParagraph">Text forwarded to the native call.</param>
        /// <param name="bPOStagged">Flag forwarded to the native call.</param>
        /// <returns>The copied result, or null when the native call returns a null pointer.</returns>
        public static String ParagraphProcess(String sParagraph, int bPOStagged)
        {
            // PtrToStringAnsi yields null for a null pointer, so no extra check is needed.
            return Marshal.PtrToStringAnsi(ParagraphProcessNative(sParagraph, bPOStagged));
        }

        /// <summary>Calls <c>ICTCLAS_Exit</c>.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Exit")]
        [return: MarshalAs(UnmanagedType.I1)]
        public static extern bool Exit();

        /// <summary>Calls <c>ICTCLAS_ImportUserDict</c> with the given file name.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDict")]
        public static extern int ImportUserDict(String sFilename);

        /// <summary>Calls <c>ICTCLAS_FileProcess</c> with source and destination file names.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_FileProcess")]
        [return: MarshalAs(UnmanagedType.I1)]
        public static extern bool FileProcess(String sSrcFilename, String sDestFilename, int bPOStagged);

        /// <summary>Calls <c>ICTCLAS_FileProcessEx</c> with source and destination file names.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_FileProcessEx")]
        [return: MarshalAs(UnmanagedType.I1)]
        public static extern bool FileProcessEx(String sSrcFilename, String sDestFilename);

        /// <summary>Calls <c>ICTCLAS_GetParagraphProcessAWordCount</c> for the given text.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_GetParagraphProcessAWordCount")]
        public static extern int GetParagraphProcessAWordCount(String sParagraph);

        /// <summary>Calls <c>ICTCLAS_ParagraphProcessAW</c>, which writes into <paramref name="result"/>.</summary>
        /// <param name="nCount">
        /// Presumably the value returned by <see cref="GetParagraphProcessAWordCount"/> - confirm.
        /// </param>
        /// <param name="result">Caller-allocated array marshalled as an out LPArray.</param>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcessAW")]
        public static extern void ParagraphProcessAW(int nCount, [Out, MarshalAs(UnmanagedType.LPArray)] result_t[] result);

        /// <summary>Calls <c>ICTCLAS_AddUserWord</c>.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_AddUserWord")]
        public static extern int AddUserWord(String sWord);

        /// <summary>Calls <c>ICTCLAS_SaveTheUsrDic</c>.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SaveTheUsrDic")]
        public static extern int SaveTheUsrDic();

        /// <summary>Calls <c>ICTCLAS_DelUsrWord</c>.</summary>
        // Made public to match the other bindings; it was the only one callers could not reach.
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_DelUsrWord")]
        public static extern int DelUsrWord(String sWord);

        public ICTCLAS()
        {
        }
    }
}
调用
代码
// Initialise the native library; bail out if it cannot start.
if (!ICTCLAS.Init(null))
{
    System.Console.WriteLine("Init ICTCLAS failed!");
    return;
}
System.Console.WriteLine("Init ICTCLAS Success!");
try
{
    String pResult;
    // NOTE(review): the bPOStagged argument was missing from the published
    // listing; 1 is assumed here - confirm the intended value.
    pResult = ICTCLAS.ParagraphProcess("点击下载超女纪敏佳深受观众喜爱禽流感爆发在非典之后", 1);
    System.Console.WriteLine(pResult);
}
finally
{
    // Always release the native library once Init has succeeded.
    ICTCLAS.Exit();
}
注
使用的时候，把 ICTCLAS 的 dll 文件、Configure.xml 和 Data 文件夹复制到程序 exe 所在的目录，否则需要指定它们的位置。
如何把 pResult 转换成昨天博文里的 wordResult 格式还是个问题，还需要好好研究。
附上官方网站的C#调用示例
代码
using System;
using System.IO;
using System.Runtime.InteropServices;

namespace win_csharp
{
    /// <summary>
    /// Managed mirror of the native <c>result_t</c> record as declared by this
    /// sample: four consecutive 32-bit fields.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the numeric offsets were missing from the published listing
    /// and are reconstructed as 0, 4, 8, 12 - confirm against the native header.
    /// </remarks>
    [StructLayout(LayoutKind.Explicit)]
    public struct result_t
    {
        [FieldOffset(0)]
        public int start;

        [FieldOffset(4)]
        public int length;

        [FieldOffset(8)]
        public int POS_id;

        [FieldOffset(12)]
        public int word_ID;
    }

    /// <summary>
    /// Console sample that initialises ICTCLAS, processes one paragraph, prints
    /// the result and shuts the library down.
    /// </summary>
    class Class
    {
        // NOTE(review): digits were stripped from the published listing, so the
        // real file name may carry a version suffix - confirm before shipping.
        const string path = @"ICTCLAS.dll";

        // NOTE(review): UnmanagedType.I1 assumes the native functions return a
        // 1-byte C++ bool rather than a 4-byte BOOL - confirm against the header.

        /// <summary>Calls the native <c>ICTCLAS_Init</c> export.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Init")]
        [return: MarshalAs(UnmanagedType.I1)]
        public static extern bool ICTCLAS_Init(String sInitDirPath);

        // Raw import returning the native pointer. Declaring the return type as
        // String would make the runtime free the returned buffer after copying,
        // which is wrong if the library owns that memory.
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcess")]
        private static extern IntPtr ParagraphProcessNative(String sParagraph, int bPOStagged);

        /// <summary>
        /// Calls the native <c>ICTCLAS_ParagraphProcess</c> export and copies its
        /// ANSI result into a managed string without freeing the native buffer.
        /// </summary>
        /// <returns>The copied result, or null when the native call returns a null pointer.</returns>
        public static String ICTCLAS_ParagraphProcess(String sParagraph, int bPOStagged)
        {
            return Marshal.PtrToStringAnsi(ParagraphProcessNative(sParagraph, bPOStagged));
        }

        /// <summary>Calls the native <c>ICTCLAS_Exit</c> export.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Exit")]
        [return: MarshalAs(UnmanagedType.I1)]
        public static extern bool ICTCLAS_Exit();

        /// <summary>
        /// The main entry point for the application.
        /// </summary>
        [STAThread]
        static void Main(string[] args)
        {
            //
            // TODO: add code here to start the application
            //
            if (!ICTCLAS_Init(null))
            {
                System.Console.WriteLine("Init ICTCLAS failed!");
                return;
            }
            System.Console.WriteLine("Init ICTCLAS Success!");
            try
            {
                String pResult;
                // NOTE(review): the bPOStagged argument was missing from the
                // published listing; 1 is assumed here - confirm.
                pResult = ICTCLAS_ParagraphProcess("点击下载超女纪敏佳深受观众喜爱禽流感爆发在非典之后", 1);
                System.Console.WriteLine(pResult);
            }
            finally
            {
                // Always release the native library once Init has succeeded.
                ICTCLAS_Exit();
            }
        }
    }
}
参考 format Blog
ICTCLAS 官方文档