用java编写的源代码--制作词表

回复: 用java编写的源代码--制作词表

// here is a copy based on Oliver Mason (2000)
// Corpus.java provides the GUI
// FileTokeniser splits a file into linguistic units (tokens)
// FreqList creates a frequency list

//Corpus.java
import java.awt.BorderLayout;
import java.awt.Font;
import java.awt.event.ActionEvent;
import java.io.*;
import java.util.*;
import javax.swing.AbstractAction;
import javax.swing.JFrame;
import javax.swing.JMenu;
import javax.swing.JMenuBar;
import javax.swing.JMenuItem;
import javax.swing.JScrollPane;
import javax.swing.JTextArea;
import corpus.*;


/**
 * Small Swing front end for building a word-frequency list.
 *
 * The "BrownCorpus" menu item tokenises {@link #corpusfile}, counts every
 * token in a FreqList, shows the list in the text area and writes it to
 * {@code corpusfile + ".frq"}. The other menu items are created but have no
 * behaviour attached to them in this class.
 */
public class Corpus {
    /** Name of the corpus file read by the word-list action. */
    protected String corpusfile = "browncorpus.txt";
    /** Platform line separator used when writing into the text area. */
    protected String nl = System.getProperty("line.separator");

    /**
     * Action behind the "BrownCorpus" menu item: builds, displays and saves
     * the frequency list of {@link #corpusfile}.
     */
    private class SwingAction extends AbstractAction {

        SwingAction() {
            super("New Action", null);
        }

        /**
         * Counts the tokens of the corpus file, appends "word, two tabs,
         * frequency" lines to the text area and saves the list to
         * {@code corpusfile + ".frq"}.
         *
         * I/O failures are reported on standard error and in the text area
         * instead of being silently discarded.
         */
        public void actionPerformed(ActionEvent e) {
            try {
                FreqList flist = new FreqList();

                FileTokeniser ft = new FileTokeniser(corpusfile);
                try {
                    while (ft.hasMoreTokens()) {
                        flist.add(ft.getNextToken());
                    }
                } finally {
                    // release the input file even if tokenising fails
                    ft.close();
                }

                PrintWriter pw = new PrintWriter(new FileWriter(corpusfile + ".frq"));
                try {
                    // build the listing first so the text area is updated once
                    StringBuffer listing = new StringBuffer();
                    Iterator words = flist.iterator();
                    while (words.hasNext()) {
                        String word = (String) words.next();
                        int freq = flist.getFreq(word);
                        listing.append(word).append("\t\t").append(freq).append(nl);
                    }
                    textArea.append(listing.toString());

                    flist.save(pw);
                    // PrintWriter never throws on write errors; ask it explicitly
                    if (pw.checkError()) {
                        throw new IOException("error while writing " + corpusfile + ".frq");
                    }
                } finally {
                    pw.close();
                }
            } catch (IOException ex) {
                System.err.println("Corpus: " + ex);
                textArea.append("Could not build the word list from " + corpusfile + ": " + ex + nl);
            }
        }
    }

    private SwingAction action = new SwingAction();
    private JTextArea textArea;
    private JFrame frame;

    /**
     * Launch the application
     * @param args command-line arguments (not used)
     */
    public static void main(String args[]) {
        try {
            Corpus window = new Corpus();
            window.frame.setVisible(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Create the application
     */
    public Corpus() {
        initialize();
    }

    /**
     * Initialize the contents of the frame: a read-only, scrollable text area
     * and the menu bar (WordList, Concordance, Collocation, Help).
     */
    protected void initialize() {
        frame = new JFrame();
        frame.getContentPane().setLayout(new BorderLayout());
        frame.setBounds(100, 100, 500, 375);
        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);

        final JScrollPane scrollPane = new JScrollPane();
        frame.getContentPane().add(scrollPane);

        textArea = new JTextArea();
        textArea.setEditable(false);
        textArea.setFont(new Font("Sans", Font.PLAIN, 14));
        scrollPane.setViewportView(textArea);

        final JMenuBar menuBar = new JMenuBar();
        frame.setJMenuBar(menuBar);

        // --- WordList menu ---
        final JMenu corpusMenu = newMenu("WordList");
        menuBar.add(corpusMenu);

        final JMenuItem openMenuItem = new JMenuItem();
        openMenuItem.setAction(action);
        // setAction installs the action's own name, so the label is set afterwards
        openMenuItem.setText("BrownCorpus");
        corpusMenu.add(openMenuItem);

        corpusMenu.addSeparator();

        final JMenu sortSubMenu = newMenu("Sort");
        corpusMenu.add(sortSubMenu);
        addItem(sortSubMenu, "Ascend");
        addItem(sortSubMenu, "Descend");

        // --- Concordance menu ---
        final JMenu concordanceMenu = newMenu("Concordance");
        menuBar.add(concordanceMenu);
        addItem(concordanceMenu, "Find...");
        addItem(concordanceMenu, "Preferences");

        // --- Collocation menu ---
        final JMenu collocationMenu = newMenu("Collocation");
        menuBar.add(collocationMenu);
        addItem(collocationMenu, "MI Score");
        addItem(collocationMenu, "T Score");
        addItem(collocationMenu, "Z Score");
        addItem(collocationMenu, "Preferences");

        // --- Help menu ---
        final JMenu helpMenu = newMenu("Help");
        menuBar.add(helpMenu);
        addItem(helpMenu, "About");
    }

    /** Creates a menu carrying the given title. */
    private static JMenu newMenu(String title) {
        JMenu menu = new JMenu();
        menu.setText(title);
        return menu;
    }

    /** Creates a menu item with the given label and appends it to the menu. */
    private static JMenuItem addItem(JMenu menu, String label) {
        JMenuItem item = new JMenuItem();
        item.setText(label);
        menu.add(item);
        return item;
    }

}
// FileTokeniser.java

package corpus;

import java.io.IOException;
import java.io.FileReader;
import java.io.BufferedReader;
import java.util.StringTokenizer;

public class FileTokeniser

{
private BufferedReader input=null;
private StringTokenizer tokeniser =null;
private String nextToken=null;

/**constructor*/
public
FileTokeniser(String infile)
throws IOException{
input= new BufferedReader(new FileReader(infile));
do {
String line=input.readLine();
tokeniser=new StringTokenizer(PreTokeniser.tokenise(line));
}while (!tokeniser.hasMoreTokens());
nextToken=tokeniser.nextToken();
}
/**check if more tokens are available*/
public boolean
hasMoreTokens(){
if (nextToken==null){
return(false);
}else{
return(true);
}}
/** read the next token*/
public String
getNextToken(){
String retval=nextToken;
if (tokeniser.hasMoreTokens()){//more available on this line
nextToken=tokeniser.nextToken();
}else{//read the next line
try{
nextToken=null;
String line=input.readLine();
while (line!=null && nextToken==null){
if (line!=null){
tokeniser=new StringTokenizer(PreTokeniser.tokenise(line));
if (tokeniser.hasMoreTokens()){
nextToken=tokeniser.nextToken();
}else{
line=input.readLine();
}
}else{
input.close();
}
}
}catch (IOException exc){
System.err.println("FileTokeniser: "+exc);
}
}
return(retval);
}
/**close the input file*/
public void
close()
throws IOException{
input.close();
input=null;
}
//FreqList.java
package corpus;


import java.util.*;
import java.io.*;
/**
 * An implementation of a frequency list: it records how often each word has
 * been added and the total number of occurrences.
 */
public class FreqList {
    /** Maps each word (String) to a one-element int[] holding its count. */
    private Map storage;
    /** Sum of all frequencies added or loaded so far. */
    private int total = 0;

    /** Creates an empty frequency list. */
    public FreqList() {
        storage = new HashMap();
    }

    /**
     * Counts one more occurrence of a word.
     *
     * @param word the word to count
     */
    public void add(String word) {
        int value[] = (int[]) storage.get(word);
        if (value == null) {
            value = new int[1];
            storage.put(word, value);
        }
        value[0]++;
        total++;
    }

    /**
     * Returns the frequency of a word.
     *
     * @param word the word to look up
     * @return the recorded frequency, or 0 if the word is not in the list
     */
    public int getFreq(String word) {
        int value[] = (int[]) storage.get(word);
        return (value == null) ? 0 : value[0];
    }

    /**
     * @return the total number of occurrences over all words
     */
    public int getN() {
        return total;
    }

    /**
     * @return an iterator over the words (Strings) in the list, in the
     *         iteration order of the underlying key set
     */
    public Iterator iterator() {
        return storage.keySet().iterator();
    }

    /**
     * Writes the list as one "word frequency" line per entry.
     *
     * @param pw destination writer; it is neither flushed nor closed here
     */
    public void save(PrintWriter pw) {
        Iterator it = storage.keySet().iterator();
        while (it.hasNext()) {
            String word = (String) it.next();
            pw.println(word + " " + getFreq(word));
        }
    }

    /**
     * Reads "word frequency" lines and merges them into this list.
     *
     * The last whitespace-separated token of a line is the frequency; all
     * tokens before it form the word, joined by single spaces, so entries
     * such as "new york 5" are accepted. Lines with fewer than two tokens
     * are reported on standard error and skipped.
     *
     * @param br source to read from until end of stream
     * @throws IOException if reading fails
     * @throws NumberFormatException if the last token of a line is not an
     *         integer
     */
    public void load(BufferedReader br) throws IOException {
        String line = br.readLine();
        while (line != null) {
            StringTokenizer st = new StringTokenizer(line);
            if (st.countTokens() < 2) {
                System.err.println("Insufficient line: '" + line + " '");
            } else {
                StringBuffer word = new StringBuffer(st.nextToken());
                // keep exactly one token back: it is the frequency
                while (st.countTokens() > 1) {
                    word.append(' ');
                    word.append(st.nextToken());
                }
                insert(word.toString(), Integer.parseInt(st.nextToken()));
            }
            line = br.readLine();
        }
    }

    /**
     * Adds {@code freq} occurrences of {@code word} to the list and to the
     * running total.
     */
    private void insert(String word, int freq) {
        int value[] = new int[1];
        total += freq;
        value[0] = freq + getFreq(word);
        storage.put(word, value);
    }
}
//the end
 
回复: 用java编写的源代码--制作词表

都是高手啊,给俺源代码还是不会用。不过还是感谢共享!
 
回复: 用java编写的源代码--制作词表

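// According to Oliver Mason (2000), FileTokeniser.java needs a utility class to support it.
// Here is that supporting class, PreTokeniser.java.
// Note: FileTokeniser is declared in package corpus, so this class must be placed in the
// same package (add "package corpus;") for FileTokeniser to compile against it.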
/* PreTokeniser.java*/

import java.util.StringTokenizer;

/**
 * Prepares a line of text for whitespace tokenisation by inserting spaces
 * around punctuation, so that punctuation marks become tokens of their own.
 */
public class PreTokeniser {
    /** Characters treated as delimiters; each one is also returned as a piece. */
    public static final String PUNC="!$\"\u00a3%^&*()_+=#{};:'`/?,. \t\n";

    /**
     * Rewrites a line so that its pieces are separated by spaces.
     *
     * The line is cut at every character of {@link #PUNC}. Each piece is
     * written out preceded by a space, with one exception: a comma, full
     * stop, hyphen or apostrophe that is directly followed by a
     * non-whitespace piece is written without a leading space and the
     * following piece is attached to it (so "3.14" and "don't" stay in one
     * piece).
     *
     * @param line the text to prepare
     * @return the line with separating spaces inserted
     */
    public static String tokenise(String line) {
        StringTokenizer pieces = new StringTokenizer(line, PUNC, true);
        StringBuffer out = new StringBuffer();
        String current = "";
        // true while the piece about to be written must be attached to the
        // joining mark written just before it
        boolean attach = false;

        while (pieces.hasMoreTokens()) {
            String previous = current;
            current = pieces.nextToken();

            if (isJoiningMark(previous)) {
                if (Character.isWhitespace(current.charAt(0))) {
                    out.append(' ');
                } else {
                    attach = true;
                }
            } else {
                if (!attach) {
                    out.append(' ');
                }
                attach = false;
            }
            out.append(previous);
        }

        // the last piece read has not been written yet
        if (!attach) {
            out.append(' ');
        }
        out.append(current);
        return out.toString();
    }

    /** Tells whether the piece is exactly one of the marks , . - ' */
    private static boolean isJoiningMark(String piece) {
        return piece.length() == 1 && ",.-'".indexOf(piece.charAt(0)) >= 0;
    }
}
 
Back
顶部