回复: Python中文分句问题
是的,但句号等标点符号会被孤零零地分到下一行,就像这样:
还是使用某种直接指定的格式
。
试试下面这个修改后的代码。
[xcode=python]
# -*- coding: utf-8 -*-
def FindTok(cutlist, char):
    """Return True if *char* is one of the delimiters in *cutlist*.

    cutlist -- container of delimiter characters (anything supporting ``in``).
    char    -- the single character to test.
    """
    # The membership test already yields a bool, so no if/else is needed.
    return char in cutlist
def Cut(cutlist, lines):
    """Split *lines* into sentences, keeping each delimiter with its sentence.

    cutlist -- iterable of single-character delimiters (must be hashable).
    lines   -- the text to split: a string or any iterable of characters.

    Returns a list of strings.  Every string except possibly the last ends
    with a delimiter from *cutlist*.  Text that follows the final delimiter
    is returned as a last element instead of being dropped.
    """
    # Build the lookup once so each character test is O(1).
    delimiters = frozenset(cutlist)
    sentences = []
    current = []
    for char in lines:
        current.append(char)
        if char in delimiters:
            # The delimiter stays attached to the sentence it terminates,
            # so punctuation never ends up alone on the next line.
            sentences.append(''.join(current))
            current = []
    if current:
        # Trailing text with no closing delimiter (e.g. the last line of a
        # file without a final newline) must not be lost.
        sentences.append(''.join(current))
    return sentences
cutlist ="[。,,!……!《》<>\"'::?\?、\|“”‘’;]{}(){}【】(){}():?!。,;、~——+%%`:“”"'‘\n\r".decode('utf-8')
for lines in file("test.txt"):
l = Cut(list(cutlist),list(lines.decode('gbk')))
i = 0
for line in l:
if line.strip() <> "":#这里可能包含空格
li = line.strip().split()
for sentence in li:
i = i +1
print i,sentence
[/xcode]