迈 n B-Char 向 n I-Char 充 n B-Char 满 n I-Char 希 n B-Char 望 n I-Char 的 n B-Char 新 n B-Char 世 n B-Char 纪 n I-Char — n B-Char — n I-Char 一 n B-Char 九 n I-Char 九 n I-Char 八 n I-Char 年 n I-Char 新 n B-Char 年 n I-Char 讲 n B-Char 话 n I-Char ( n B-Char 附 n B-Char 图 n B-Char 片 n I-Char 1 n B-Char 张 n B-Char ) n B-Char
# -*- coding: utf-8 -*-
from seqeval.metrics import f1_score
from seqeval.metrics import precision_score
from seqeval.metrics import accuracy_score
from seqeval.metrics import recall_score
from seqeval.metrics import classification_report
def read_label_columns(path):
    """Read true and predicted labels from a tab-separated prediction file.

    Blank lines are skipped. On every other line the last tab-separated
    column is taken as the predicted label and the second-to-last column as
    the true label.

    Args:
        path: Path of the UTF-8 encoded prediction file.

    Returns:
        A ``(y_true, y_pred)`` tuple of two equally long lists of labels.

    Raises:
        ValueError: If a non-blank line has fewer than two columns.
    """
    y_true = []
    y_pred = []
    with open(path, "r", encoding="utf-8") as f:
        for line_no, raw in enumerate(f, start=1):
            line = raw.strip()
            if not line:
                continue
            # Split once and validate before appending so the two lists can
            # never get out of step on a malformed line.
            columns = line.split("\t")
            if len(columns) < 2:
                raise ValueError(
                    f"{path}:{line_no}: expected at least 2 tab-separated "
                    f"columns, got {len(columns)}"
                )
            y_true.append(columns[-2])
            y_pred.append(columns[-1])
    return y_true, y_pred


if __name__ == "__main__":
    # NOTE(review): presumably the seqeval metric functions imported at the
    # top of this script are applied to these two lists by code that is not
    # visible here -- confirm.
    y_true, y_pred = read_label_columns("crf_pred.txt")
# -*- coding: utf-8 -*-
# Segment a Chinese sentence: write one line per character, run the
# `crf_test` command-line tool on it, then merge the predicted
# B-Char / I-Char tags back into words.
import os

BEGIN_TAG = "B-Char"
INSIDE_TAG = "I-Char"

text = "上海野生动物园群熊伤人事件救援画面曝光"


def write_crf_input(sentence, path):
    """Write `sentence` to `path`, one character per line.

    Each line is ``<char>\\tn\\tB-Char``; the second and third columns are
    fixed values (presumably placeholders matching the layout of the
    training data -- confirm against the feature template).
    """
    with open(path, "w", encoding="utf-8") as g:
        g.writelines(f"{char}\tn\t{BEGIN_TAG}\n" for char in sentence)


def read_predicted_tags(path):
    """Return the last tab-separated column of every non-blank line in `path`.

    Blank lines are skipped so that the returned list contains exactly one
    tag per data line.
    """
    with open(path, "r", encoding="utf-8") as f:
        stripped = (raw.strip() for raw in f)
        return [line.split("\t")[-1] for line in stripped if line]


def tags_to_words(sentence, tags):
    """Merge the characters of `sentence` into words according to `tags`.

    An ``I-Char`` tag extends the word that is currently open; any other tag
    (normally ``B-Char``) starts a new word. An ``I-Char`` with no open word
    also starts a new word, so no character is ever dropped.

    Args:
        sentence: The text that was tagged.
        tags: One tag per character of `sentence`.

    Returns:
        The list of words, in order.

    Raises:
        ValueError: If `tags` and `sentence` differ in length.
    """
    if len(tags) != len(sentence):
        raise ValueError(
            f"got {len(tags)} tags for {len(sentence)} characters"
        )
    words = []
    current = []
    for char, tag in zip(sentence, tags):
        if tag == INSIDE_TAG and current:
            current.append(char)
            continue
        if current:
            words.append("".join(current))
        current = [char]
    if current:
        words.append("".join(current))
    return words


if __name__ == "__main__":
    # Generate the file to be predicted.
    write_crf_input(text, "predict.data")

    # Run the CRF model through its command-line tool.
    os.system("crf_test -m model predict.data > predict_new.txt")

    # Turn the predicted tags into the word segmentation result.
    predict_tags = read_predicted_tags("predict_new.txt")
    words = tags_to_words(text, predict_tags)
    print(words)
# -*- coding: utf-8 -*-
# Segment a Chinese sentence with the `crf_test` command-line tool, then
# override the predicted tags with the words listed in a user dictionary
# before merging the characters into words.
import os

BEGIN_TAG = "B-Char"
INSIDE_TAG = "I-Char"

text = "上海野生动物园群熊伤人事件救援画面曝光"


def write_crf_input(sentence, path):
    """Write `sentence` to `path`, one character per line.

    Each line is ``<char>\\tn\\tB-Char``; the second and third columns are
    fixed values (presumably placeholders matching the layout of the
    training data -- confirm against the feature template).
    """
    with open(path, "w", encoding="utf-8") as g:
        g.writelines(f"{char}\tn\t{BEGIN_TAG}\n" for char in sentence)


def read_predicted_tags(path):
    """Return the last tab-separated column of every non-blank line in `path`."""
    with open(path, "r", encoding="utf-8") as f:
        stripped = (raw.strip() for raw in f)
        return [line.split("\t")[-1] for line in stripped if line]


def read_user_words(path):
    """Return the non-blank, whitespace-stripped lines of `path`.

    Blank lines are dropped because an empty "word" would match at every
    position of the text.
    """
    with open(path, "r", encoding="utf-8") as h:
        stripped = (raw.strip() for raw in h)
        return [word for word in stripped if word]


def apply_user_dict(sentence, tags, user_words):
    """Return a copy of `tags` in which every dictionary word is one word.

    For each occurrence of a user word in `sentence` (including one that
    ends exactly at the end of the text) the first character is tagged
    ``B-Char`` and the remaining characters ``I-Char``. The character right
    after the occurrence, if any, is tagged ``B-Char`` so the dictionary
    word is not merged with what follows. All other tags are left as
    predicted. Later entries of `user_words` override earlier ones where
    they overlap.

    Args:
        sentence: The text that was tagged.
        tags: One predicted tag per character of `sentence`; not modified.
        user_words: Words that must come out as single tokens. Empty
            strings are ignored.

    Returns:
        A new list of tags.

    Raises:
        ValueError: If `tags` and `sentence` differ in length.
    """
    if len(tags) != len(sentence):
        raise ValueError(
            f"got {len(tags)} tags for {len(sentence)} characters"
        )
    fixed = list(tags)
    for word in user_words:
        size = len(word)
        if size == 0:
            continue
        # "+ 1" so that a match ending on the last character is found too.
        for start in range(len(sentence) - size + 1):
            if sentence[start:start + size] != word:
                continue
            end = start + size
            fixed[start] = BEGIN_TAG
            fixed[start + 1:end] = [INSIDE_TAG] * (size - 1)
            if end < len(fixed):
                fixed[end] = BEGIN_TAG
    return fixed


def tags_to_words(sentence, tags):
    """Merge the characters of `sentence` into words according to `tags`.

    An ``I-Char`` tag extends the word that is currently open; any other tag
    (normally ``B-Char``) starts a new word. An ``I-Char`` with no open word
    also starts a new word, so no character is ever dropped.

    Raises:
        ValueError: If `tags` and `sentence` differ in length.
    """
    if len(tags) != len(sentence):
        raise ValueError(
            f"got {len(tags)} tags for {len(sentence)} characters"
        )
    words = []
    current = []
    for char, tag in zip(sentence, tags):
        if tag == INSIDE_TAG and current:
            current.append(char)
            continue
        if current:
            words.append("".join(current))
        current = [char]
    if current:
        words.append("".join(current))
    return words


if __name__ == "__main__":
    # Generate the file to be predicted.
    write_crf_input(text, "predict.data")

    # Run the CRF model through its command-line tool.
    os.system("crf_test -m model predict.data > predict_new.txt")

    # Collect the predicted tag of every character.
    predict_tags = read_predicted_tags("predict_new.txt")

    # Implement the user dictionary by overriding the predicted tags.
    user_words = read_user_words("user_dict.txt")
    predict_tags = apply_user_dict(text, predict_tags, user_words)

    # Post-process the tags into the word segmentation result.
    words = tags_to_words(text, predict_tags)
    print(words)