回复:[CODE SNIPPET] VB.NET制作词频表
以下是引用 xujiajin 在 2005-8-6 0:22:58 的发言:
dzhigner, 我是个外行,你还是帮忙看看上面的script有没有问题。谢谢。
如下是一个类模块作为例子:
Imports System.IO
Imports System.Text
Imports System.Text.RegularExpressions
''' <summary>
''' Demo class that builds a word-frequency list from a text file, optionally
''' mapping each (upper-cased) word form to a lemma via a lookup table.
''' </summary>
Public Class BUILD_WORDLIST_DEMO1
    ' Running count of tokens that matched the word regex; reset at the start of MAIN.
    Dim TK As Integer

    ''' <summary>
    ''' Reads FLNM line by line, splits every line on the characters in STRDELIMS,
    ''' and counts each token that matches WDRegex.
    ''' </summary>
    ''' <param name="FLNM">Path of the text file to read.</param>
    ''' <param name="WDRegex">A token is counted only if this regex matches the raw token.</param>
    ''' <param name="STRDELIMS">Every character of this string is used as a split delimiter.</param>
    ''' <param name="OUTPUT_UPCS">
    ''' When True the token is upper-cased and, if HASH2 contains the upper-cased form,
    ''' replaced by the value stored in HASH2. When False the raw token is used as the key.
    ''' </param>
    ''' <param name="ENCODING">Encoding used to open the file.</param>
    ''' <param name="HASH2">Lookup table (upper-cased word form -> replacement); may be Nothing.</param>
    ''' <returns>
    ''' A StringBuilder with one "key TAB count CRLF" line per distinct key. If an exception
    ''' occurs it is shown with MsgBox and whatever has been built so far is returned.
    ''' </returns>
    Private Function BUILDWLIST(ByVal FLNM As String, ByVal WDRegex As Regex, ByVal STRDELIMS As String, ByVal OUTPUT_UPCS As Boolean, ByVal ENCODING As Encoding, ByVal HASH2 As Hashtable) As StringBuilder
        Dim WDUPCS, LINE, TOKEN As String
        Dim WORDS As String()
        ' Initialised to Nothing so the Finally block can tell whether the open succeeded.
        Dim READER As System.IO.StreamReader = Nothing
        Dim HASHTBL As New Hashtable
        Dim DELIMS As Char() = STRDELIMS.ToCharArray
        Dim BOO As New StringBuilder
        Dim ENTRY As DictionaryEntry
        Dim COUNT As Object
        Try
            READER = New System.IO.StreamReader(FLNM, ENCODING)
            Do While Not READER.Peek < 0
                LINE = Trim(READER.ReadLine)
                If LINE <> "" Then
                    WORDS = LINE.Split(DELIMS)
                    For Each TOKEN In WORDS
                        ' The regex is tested against the raw token, not the normalised key.
                        If WDRegex.IsMatch(TOKEN) Then
                            If OUTPUT_UPCS Then
                                WDUPCS = TOKEN.ToUpper
                                If Not HASH2 Is Nothing AndAlso HASH2.Contains(WDUPCS) Then
                                    WDUPCS = CType(HASH2.Item(WDUPCS), String)
                                End If
                            Else
                                WDUPCS = TOKEN
                            End If
                            TK = TK + 1
                            ' Item returns Nothing for a missing key, so one lookup is enough.
                            COUNT = HASHTBL.Item(WDUPCS)
                            If COUNT Is Nothing Then
                                HASHTBL.Add(WDUPCS, 1)
                            Else
                                HASHTBL.Item(WDUPCS) = CType(COUNT, Integer) + 1
                            End If
                        End If
                    Next
                End If
            Loop
            Console.WriteLine(HASHTBL.Count)
            For Each ENTRY In HASHTBL
                BOO.Append(CType(ENTRY.Key, String) & vbTab & CType(ENTRY.Value, String) & vbCrLf)
            Next
        Catch ex As Exception
            MsgBox(ex.ToString)
        Finally
            ' Close the reader on every path; it is still Nothing if the open itself failed.
            If Not READER Is Nothing Then
                READER.Close()
            End If
        End Try
        Return BOO
    End Function

    ''' <summary>
    ''' Loads the lemma table from D:\ENGLISH_CORPORA\LEMMALIST.TXT, builds the frequency
    ''' list for D:\ENGLISH_CORPORA\RAW\BROWN_SENTENCE.TXT and writes the token total and
    ''' the list to the console.
    ''' </summary>
    Public Sub MAIN()
        TK = 0
        Dim HASH As New Hashtable
        Dim LINE As String
        Dim WORDS As String()
        ' The lemma list used in this example has lines of the form: abolished=abolish
        Dim DELI As Char() = {"="c}
        ' IgnoreCase Or Compiled is the value 9 that was previously passed as a bare number.
        Dim REGX As New Regex("^\b[A-Za-z\-]+\b", RegexOptions.IgnoreCase Or RegexOptions.Compiled)
        Dim EC As Encoding = Encoding.UTF8
        Dim STRDELI As String = ".,;:!#$^&()<>+=/\'?|`~ " & Chr(34) & Chr(32)
        Dim FN As String = "D:\ENGLISH_CORPORA\RAW\BROWN_SENTENCE.TXT"
        Dim SB As StringBuilder
        Dim KEY As String
        Dim LEMREADER As StreamReader = New StreamReader("D:\ENGLISH_CORPORA\LEMMALIST.TXT", EC)
        Try
            Do While Not LEMREADER.Peek < 0
                LINE = Trim(LEMREADER.ReadLine)
                WORDS = LINE.Split(DELI)
                ' Only lines that contain at least one "=" yield a (form, lemma) pair.
                If WORDS.GetUpperBound(0) >= 1 Then
                    KEY = WORDS(0).ToUpper
                    ' The first mapping seen for a word form wins.
                    If Not HASH.Contains(KEY) Then
                        HASH.Add(KEY, WORDS(1).ToUpper)
                    End If
                End If
            Loop
        Finally
            ' Release the lemma file even if reading fails part-way through.
            LEMREADER.Close()
        End Try
        SB = BUILDWLIST(FN, REGX, STRDELI, True, EC, HASH)
        If Not SB Is Nothing Then
            Console.WriteLine("TOTAL:" & TK)
            Console.Write(SB.ToString)
        End If
    End Sub
End Class
'**************************************************************
调用以上类的方法,例子如下:
' Menu click handler: creates the word-list demo class declared above and runs it.
Private Sub MenuItem5_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles MenuItem5.Click
    ' The class is declared as BUILD_WORDLIST_DEMO1; the name without the trailing "1" does not exist.
    Dim k As New BUILD_WORDLIST_DEMO1
    k.MAIN()
End Sub
'**************************************************************
[本贴已被 作者 于 2005年08月06日 04时14分20秒 编辑过]