Overview
Using the probabilistic nature of a Markov model, we assemble the smallest units of a language, words, into sentences, and then combine those sentences into an article.
A Markov model is a statistical model that records transition probabilities: starting from any state, it walks to a next state according to those probabilities, and the outgoing probabilities of each state sum to 100%.
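As a minimal sketch with made-up words and probabilities (not taken from the corpus used below), such a model can be stored as a nested dictionary, where each state's outgoing probabilities sum to 1:

```python
# Toy word-level transition table; the words and probabilities are hypothetical
transitions = {
    "生活": {"很": 0.5, "是": 0.5},
    "很":   {"美好": 1.0},
    "是":   {"美好": 1.0},
}

# Sanity check: each state's outgoing probabilities sum to 100%
for state, successors in transitions.items():
    assert abs(sum(successors.values()) - 1.0) < 1e-9
```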
Implementation
Loading
```python
# Read the corpus line by line and strip the trailing newlines
with open("test.txt", encoding="utf-8") as f:
    sentences = f.readlines()
sentences = [s.strip() for s in sentences]
```
Read in the text file and split it into lines.
Sentence segmentation
```python
import re
import string

# Delimiter set used for sentence splitting (full-width and half-width forms)
delims = [",", "。", ";", ":", "!", "?", "?", ";", ":", "!", ",", ".",
          "\"", "'", "“", "‘", "’", "(", ")", "”", "(", ")", "%", "%",
          "@", "~", "`", "~", "`", "#", "、", "/", "\\", "<", ">", "《", "》",
          "/", "{", "}", "{", "}", "[", "]", "[", "]", "|", "|",
          "\n", "\r", " ", "\t", " ", '+', '=', '*', '^', '·'] \
    + list("0123456789") \
    + list(string.punctuation)
escaped = re.escape(''.join(delims))
exclusions = '[' + escaped + ']'  # character class matching every delimiter

# Replace delimiters with spaces, split on whitespace, and collect the pieces
splitsen = []
for s in sentences:
    cleans = re.sub(exclusions, ' ', s)
    subs = cleans.split()
    splitsen.extend(subs)

# Add start/end markers to the head and tail of every sentence
for idx, s in enumerate(splitsen):
    splitsen[idx] = 'S' + s + "E"
```
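A quick check with a made-up line (not from the corpus) shows what the splitter produces before the markers are added:

```python
# Illustrative input, not actual corpus text
print(re.sub(exclusions, ' ', "今天天气很好,出门走走。").split())
# -> ['今天天气很好', '出门走走']
```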
Word segmentation
```python
import jieba

jieba.load_userdict('dict.txt.big')  # load the big (traditional-Chinese) dictionary

# Segment every marked sentence into words
words = []
for s in splitsen:
    ws = list(jieba.cut(s))
    words.extend(ws)
```
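For reference, `jieba.cut` returns a generator of tokens, and because the S/E markers were concatenated onto each sentence, they survive segmentation as ordinary tokens. The exact cut depends on the dictionary loaded; the output below is only an example:

```python
# Quick check; the segmentation shown is illustrative, not guaranteed
print(list(jieba.cut("S我爱自然语言处理E")))
# e.g. ['S', '我', '爱', '自然', '语言', '处理', 'E']
```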
Building the word dictionary
```python
def build_word_dict(words):
    # word_dict[prev][next] = how many times `next` follows `prev` in the corpus
    word_dict = {}
    for i in range(1, len(words)):
        if words[i-1] not in word_dict:
            word_dict[words[i-1]] = {}
        if words[i] not in word_dict[words[i-1]]:
            word_dict[words[i-1]][words[i]] = 0
        word_dict[words[i-1]][words[i]] += 1
    return word_dict

word_dict = build_word_dict(words)
print(words)
print(word_dict["人"])
```
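A quick test on a toy token list (made up for illustration) shows the structure: one outer key per word, and an inner dictionary counting each successor:

```python
# Toy token list, not the real corpus
print(build_word_dict(['S', '我', '爱', '我', '爱', 'E']))
# -> {'S': {'我': 1}, '我': {'爱': 2}, '爱': {'我': 1, 'E': 1}}
```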
Random composition
```python
from random import randint

# Sum the frequencies of all successors
def wordListSum(wordList):
    sumfreq = 0
    for word, freq in wordList.items():
        sumfreq += freq
    return sumfreq

# Draw the next word according to the frequency distribution
def retrieveRandomWord(wordList):
    # pick a random integer in 1..total, then walk the successors,
    # subtracting frequencies until the draw falls inside one of them
    randIndex = randint(1, wordListSum(wordList))
    for word, freq in wordList.items():
        randIndex -= freq
        if randIndex <= 0:
            return word

# Generate a Markov chain of length 100
length = 100
chain = ""
currentWord = "生活"
for i in range(0, length):
    chain += currentWord
    print(currentWord, "=>", word_dict[currentWord])
    currentWord = retrieveRandomWord(word_dict[currentWord])
#print(chain)
```
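One caveat: a word that only ever appears at the very end of the corpus has no recorded successors, so `word_dict[currentWord]` raises a `KeyError`. The sketch below (my addition, not part of the original code) guards against that by restarting the walk from the seed word, and also shows `random.choices` from the standard library as an equivalent way to draw the next word:

```python
import random

# Equivalent draw using the standard library: weights are the raw frequencies
def retrieve_random_word(wordList):
    return random.choices(list(wordList), weights=list(wordList.values()))[0]

# Hypothetical guard: restart from the seed word when we hit a dead end
seed = "生活"
currentWord = seed
chain = ""
for i in range(length):
    chain += currentWord
    if currentWord not in word_dict:  # no successor recorded for this word
        currentWord = seed
        continue
    currentWord = retrieve_random_word(word_dict[currentWord])
```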
Stripping the sentence markers
```python
import re

reply = re.split('S|E', chain)         # cut the chain at the S/E markers
reply = [s for s in reply if s != '']  # drop the empty fragments
for x in reply:
    print(x)
```
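For example, if the generated chain were the string below (the markers run together because the loop concatenates words with no separator), the split recovers one sentence per fragment. Note that `re.split('S|E', ...)` would also cut at any literal S or E inside English tokens; with a purely Chinese corpus this is harmless:

```python
# Illustrative chain, not actual program output
demo_chain = "S今天天气很好ES生活很美好E"
print([s for s in re.split('S|E', demo_chain) if s != ''])
# -> ['今天天气很好', '生活很美好']
```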