以英语句子为例,去除每个句子中的“停用词”和“标点符号”
发布时间
阅读量:
阅读量
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
/**
 * Cleans English sentences stored one per line in a text file.
 *
 * <p>For every line the program lower-cases the text, strips a fixed set of
 * ASCII punctuation marks, removes every stop word listed in a second file,
 * and finally removes all space characters before printing the result.
 */
public class FileExcludeStopWord {

    /** Input file, read relative to the working directory; one sentence per line. */
    private static final String DATA_FILE = "gvm_data.txt";

    /** Stop-word file, read relative to the working directory; one stop word per line. */
    private static final String STOP_WORD_FILE = "StopWordTable.txt";

    /** Punctuation marks deleted from every sentence (the hyphen is deliberately kept). */
    private static final String[] PUNCTUATION = {
        "[", "]", ".", ",", ":", "\\", "/", "?", "!", ";", "\"", "'"
    };

    public static void main(String[] args) {
        new FileExcludeStopWord().ex();
    }

    /**
     * Runs the whole pipeline and prints one cleaned line per input sentence
     * to standard output:
     * <ol>
     *   <li>read the sentences and the stop words, line by line;</li>
     *   <li>lower-case each sentence and delete punctuation;</li>
     *   <li>delete every space-delimited stop word ("and", "to", "he", ...);</li>
     *   <li>delete all remaining space characters.</li>
     * </ol>
     *
     * <p>If either input file is missing a message is printed and nothing is
     * processed. If reading fails the stack trace is printed and the method
     * returns without producing output.
     */
    public void ex() {
        File file = new File(DATA_FILE);
        File stopwords = new File(STOP_WORD_FILE);
        // Both files are required, so bail out when EITHER one is missing.
        if (!file.exists() || !stopwords.exists()) {
            System.out.println("文件不存在");
            return;
        }

        List<String> sentences;
        List<String> stopWordsList;
        try {
            sentences = readLines(file);
            stopWordsList = readLines(stopwords);
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }

        for (String sentence : sentences) {
            // Locale.ROOT keeps the lower-casing independent of the default
            // locale (e.g. a Turkish locale would map "I" to a dotless i).
            String cleaned = stripPunctuation(sentence.toLowerCase(Locale.ROOT));
            cleaned = removeStopWords(cleaned, stopWordsList);
            // The final output has every space removed, words are concatenated.
            System.out.println(cleaned.replace(" ", ""));
        }
    }

    /**
     * Reads every line of {@code source} into a list; the reader is always closed.
     *
     * @param source the text file to read
     * @return the lines in file order, without line terminators
     * @throws IOException if the file cannot be opened, read or closed
     */
    private static List<String> readLines(File source) throws IOException {
        List<String> lines = new ArrayList<String>();
        BufferedReader reader = new BufferedReader(new FileReader(source));
        try {
            String current;
            while ((current = reader.readLine()) != null) {
                // Drop a leading byte-order mark (U+FEFF) so the first entry
                // of the file compares equal to ordinary text.
                if (lines.isEmpty() && current.length() > 0 && current.charAt(0) == '\uFEFF') {
                    current = current.substring(1);
                }
                lines.add(current);
            }
        } finally {
            reader.close();
        }
        return lines;
    }

    /** Returns {@code text} with every mark listed in {@link #PUNCTUATION} deleted. */
    private static String stripPunctuation(String text) {
        String result = text;
        for (String mark : PUNCTUATION) {
            result = result.replace(mark, "");
        }
        return result;
    }

    /**
     * Removes every space-delimited occurrence of each stop word from {@code sentence}.
     *
     * @param sentence  the sentence to filter
     * @param stopWords the stop words; surrounding whitespace is ignored and blank entries are skipped
     * @return the sentence padded with one leading and one trailing space, stop words removed
     */
    private static String removeStopWords(String sentence, List<String> stopWords) {
        // Pad so that the first and last word are space-delimited as well.
        String padded = " " + sentence + " ";
        for (String stopWord : stopWords) {
            String word = stopWord.trim();
            if (word.isEmpty()) {
                continue;
            }
            String target = " " + word + " ";
            // A single replace() misses every second occurrence of a repeated
            // stop word because adjacent matches share a space, so repeat
            // until none is left. Each pass shortens the string, so this ends.
            while (padded.contains(target)) {
                padded = padded.replace(target, " ");
            }
        }
        return padded;
    }
}
结果展示
problemsexistagriculture
queriedpremiernewplanagriculture
developmentfoodindustrydependsagriculture
industryagricultureimportantsectorsnationaleconomy
wrongdirectionfarmingplanet
unitedstatescontrastgoesagricultureirrigation
statefoodagriculturepointspressingquestionsanswered
invited ministersworldconferenceagricultureclimatechange
sectoreconomycagriculture ctoillustratemean
frighteningpredictionsmindtryheat-proofagriculture
australiaagriculturealtereddestroyedhalfwoodlandforests
soonagricultureresumedisaster
全部评论 (0)
还没有任何评论哟~
