https://github.com/mayabot/mynlp
需求:
用户输入的连续无空格拼音流,解析为一个个字的拼音。
例如:
“shujubianhuaqushidengxiangguanxinxi”
解析为
shu, ju, bian, hua, qu, shi, deng, xiang, guan, xin, xi
引入感知机通用框架
<dependency>
<groupId>com.mayabot.mynlp</groupId>
<artifactId>mynlp-perceptron</artifactId>
<version>3.0.0</version>
</dependency>
是一个通用的、高性能感知机框架。mynlp其他感知机分词、词性标注、人名识别、NER等都是基于这个框架开发的。
当然你可以基于这个框架解决其他序列标注问题,下面我们将演示如果切分拼音流。(wanzhengdepinyin => wan zheng de pin yin)
你只需告知感知机框架两件事情:
– 特征提取函数
– label编码
– 原始语料如果转换为(输入=标签)二元组
定义感知机
class PinyinSplitDefinition : PerceptronDefinition<Char, String, CharArray>() {
override val labels = listOf("B", "M", "E", "S")
override fun labelIndex(label: String): Int {
return when (label) {
"B" -> 0
"M" -> 1
"E" -> 2
"S" -> 3
else -> 0
}
}
override fun buffer() = FastStringBuilder(4)
override fun featureFunction(sentence: CharArray, size: Int, position: Int, buffer: FastStringBuilder, emit: () -> Unit) {
val CHAR_NULL = '\u0000'
val lastIndex = size - position - 1
val pre2Char = if (position > 1) sentence[position - 2] else CHAR_NULL
val preChar = if (position > 0) sentence[position - 1] else CHAR_NULL
val curChar = sentence[position]
val nextChar = if (lastIndex > 0) sentence[position + 1] else CHAR_NULL
val next2Char = if (lastIndex > 1) sentence[position + 2] else CHAR_NULL
buffer.clear()
buffer.set2(curChar, '2')
emit()
if (position > 0) {
buffer.clear()
buffer.set2(preChar, '1')
emit()
buffer.clear()
buffer.set4(preChar, '/', curChar, '5')
emit()
if (position > 1) {
buffer.clear()
buffer.set4(pre2Char, '/', preChar, '4')
emit()
}
}
if (lastIndex > 0) {
buffer.clear()
buffer.set2(nextChar, '3')
emit()
buffer.clear()
buffer.set4(curChar, '/', nextChar, '6')
emit()
if (lastIndex > 1) {
buffer.clear()
buffer.set4(nextChar, '/', next2Char, '7')
emit()
}
}
}
override fun inputList2InputSeq(list: List<Char>): CharArray {
return list.toCharArray()
}
/**
* "世界 你好" => 世/B 界/E 你/B 好/E
* B M S E
*/
override fun annotateText(text: String): List<Pair<Char, String>> {
return text.splitToSequence('﹍')
.flatMap { word ->
when (word.length) {
0 -> emptyList()
1 -> listOf(word[0] to "S")
2 -> listOf(word[0] to "B", word[1] to "E")
3 -> listOf(word[0] to "B", word[1] to "M", word[2] to "E")
4 -> listOf(word[0] to "B", word[1] to "M", word[2] to "M", word[3] to "E")
5 -> listOf(word[0] to "B", word[1] to "M", word[2] to "M", word[3] to "M", word[4] to "E")
else -> {
val list = ArrayList<Pair<Char, String>>(word.length)
list += word[0] to "B"
for (i in 1 until word.length - 1) {
list += word[i] to "M"
}
list += word[0] to "E"
list.toList()
}
}.asSequence()
}.toList()
}
}
fun pinyinSplitEvaluateFun(id:Int,model:Perceptron,sampleList:List<String>):EvaluateResult{
var count = 0
var goldTotal = 0
var predTotal = 0
var correct = 0
val segmenter = PinyinSplitApp(model)
for (line in sampleList) {
val wordArray = line.split("﹍")
goldTotal += wordArray.size
val text = wordArray.joinToString(separator = "")
val predArray = segmenter.decodeToWordList(text)
predTotal += predArray.size
correct += wordCorrect(wordArray,predArray)
count++
}
return EvaluateResult(goldTotal, predTotal, correct)
}
训练模型
val definition = PinyinSplitDefinition()
val model = definition.train(
File("data/pinyin/yuliao_a.txt"),
File("data/pinyin/yuliao_b.txt"),80,4,false,::pinyinSplitEvaluateFun)
model.save(File("data/pinyin/model"))
语料格式为
yin﹍shi﹍zhu﹍da﹍jia﹍cang﹍cai﹍mei﹍you﹍jiu﹍sui
模型解码:
class PinyinSplitApp(val model: Perceptron) {
private val logic = PinyinSplitDefinition()
fun decodeToWordList(sentence: String): List<String> {
val result = ArrayList<String>()
val input = sentence.toCharArray()
val output = logic.decodeModel(model, input)
var p = 0
for (i in 0 until output.size) {
val f = output[i]
if (f == "S" || f == "E") {
result += sentence.substring(p, i + 1)
p = i + 1
}
}
if (p < sentence.length) {
result += sentence.substring(p, sentence.length)
}
return result
}
companion object {
const val modelPrefix = "pinyin-split-model"
fun load(file: File):PinyinSplitApp {
return PinyinSplitApp(PerceptronModel.load(file))
}
fun loadDefault():PinyinSplitApp{
return PinyinSplitApp(PerceptronModel.loadFromNlpResouce(modelPrefix))
}
}
}
测试
val app = PinyinSplitApp.load(File("data/pinyin/model")
val result = app.decodeToWordList("shujubianhuaqushidengxiangguanxinxi")
输出
shu, ju, bian, hua, qu, shi, deng, xiang, guan, xin, xi