Advertisement

以英语句子为例,去除每个句子的“停用词”,“标点”

阅读量:
复制代码
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileReader;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Set;
    
  
    
 /**
  * Normalises English sentences read from a text file, one sentence per line.
  *
  * <p>Each line of {@code gvm_data.txt} is lower-cased, stripped of punctuation, filtered
  * against the stop words listed in {@code StopWordTable.txt} (one word per line) and finally
  * printed with every space removed, so the surviving words of a sentence are concatenated.
  */
 public class FileExcludeStopWord {

     /** Input file holding one sentence per line. */
     private static final String DATA_FILE = "gvm_data.txt";

     /** Input file holding one stop word per line. */
     private static final String STOP_WORD_FILE = "StopWordTable.txt";

     /**
      * Message printed when an input file is missing (Chinese for "file does not exist").
      * Written with Unicode escapes so the source compiles under any source encoding.
      */
     private static final String MISSING_FILE_MESSAGE = "\u6587\u4ef6\u4e0d\u5b58\u5728";

     /** Punctuation marks deleted from every sentence before stop-word filtering. */
     private static final String[] PUNCTUATION = {
         "[", "]", ".", ",", ":", "\\", "/", "?", "!", ";", "\"", "'"
     };

     /**
      * Program entry point; runs {@link #ex()} on the files in the working directory.
      *
      * @param args ignored
      */
     public static void main(String[] args) {
         new FileExcludeStopWord().ex();
     }

     /**
      * Runs the whole pipeline and prints one cleaned sentence per line to standard output.
      *
      * <ol>
      *   <li>read the sentences (one per line) and the stop-word table;</li>
      *   <li>lower-case each sentence and delete its punctuation;</li>
      *   <li>drop every word that is a stop word ("and", "to", "he", ...);</li>
      *   <li>remove the remaining spaces.</li>
      * </ol>
      *
      * <p>If either input file is missing a message is printed and nothing else happens. If a
      * file cannot be read the stack trace is printed and processing stops, because output
      * produced from partial input would look valid while being wrong.
      */
     public void ex() {
         File file = new File(DATA_FILE);
         File stopwords = new File(STOP_WORD_FILE);
         // Both inputs are mandatory, so a single missing file is enough to give up.
         if (!file.exists() || !stopwords.exists()) {
             System.out.println(MISSING_FILE_MESSAGE);
             return;
         }

         List<String> sentences;
         Set<String> stopWordSet;
         try {
             sentences = readLines(file);
             stopWordSet = loadStopWords(stopwords);
         } catch (IOException e) {
             e.printStackTrace();
             return;
         }

         for (String sentence : sentences) {
             System.out.println(clean(sentence, stopWordSet));
         }
     }

     /**
      * Reads every line of a text file, always closing the file afterwards.
      *
      * @param source file to read
      * @return the lines in file order, without line terminators
      * @throws IOException if the file cannot be opened or read
      */
     private static List<String> readLines(File source) throws IOException {
         List<String> lines = new ArrayList<String>();
         BufferedReader reader = new BufferedReader(new FileReader(source));
         try {
             String line;
             while ((line = reader.readLine()) != null) {
                 lines.add(line);
             }
         } finally {
             reader.close();
         }
         return lines;
     }

     /**
      * Loads the stop-word table, including its first line.
      *
      * <p>Entries are trimmed and lower-cased so they compare equal to the lower-cased sentence
      * words; blank lines are skipped. Each entry is expected to be a single word.
      *
      * @param source stop-word file, one word per line
      * @return the set of stop words
      * @throws IOException if the file cannot be opened or read
      */
     private static Set<String> loadStopWords(File source) throws IOException {
         Set<String> words = new HashSet<String>();
         for (String line : readLines(source)) {
             String word = line.trim().toLowerCase(Locale.ROOT);
             if (!word.isEmpty()) {
                 words.add(word);
             }
         }
         return words;
     }

     /**
      * Cleans a single sentence: lower-case, strip punctuation, drop stop words, remove spaces.
      *
      * @param sentence raw sentence as read from the data file
      * @param stopWordSet lower-cased stop words to drop
      * @return the remaining words concatenated without separators
      */
     private static String clean(String sentence, Set<String> stopWordSet) {
         String normalized = sentence.toLowerCase(Locale.ROOT);
         for (String mark : PUNCTUATION) {
             normalized = normalized.replace(mark, "");
         }

         // Comparing whole tokens (instead of replacing " word " substrings) also removes
         // stop words that directly follow each other, e.g. "and and".
         // Only the plain space separates words; other whitespace stays part of a token.
         StringBuilder kept = new StringBuilder();
         for (String word : normalized.split(" ")) {
             if (!stopWordSet.contains(word)) {
                 kept.append(word);
             }
         }
         return kept.toString();
     }
 }

结果展示

复制代码
 problemsexistagriculture

    
 queriedpremiernewplanagriculture  
    
 developmentfoodindustrydependsagriculture  
    
 industryagricultureimportantsectorsnationaleconomy  
    
 wrongdirectionfarmingplanet  
    
 unitedstatescontrastgoesagricultureirrigation  
    
 statefoodagriculturepointspressingquestionsanswered  
    
 invited ministersworldconferenceagricultureclimatechange  
    
 sectoreconomycagriculture ctoillustratemean 
    
 frighteningpredictionsmindtryheat-proofagriculture  
    
 australiaagriculturealtereddestroyedhalfwoodlandforests  
    
 soonagricultureresumedisaster

全部评论 (0)

还没有任何评论哟~