From 1dbb3a06ff30aca0d260505470efdce498dc1183 Mon Sep 17 00:00:00 2001
From: Meiqi Guo <mei-qi.guo@student.ecp.fr>
Date: Fri, 17 Mar 2017 21:49:59 +0100
Subject: [PATCH] Delete Preprocess.java

---
 output/Preprocess.java | 172 -----------------------------------------
 1 file changed, 172 deletions(-)
 delete mode 100644 output/Preprocess.java

diff --git a/output/Preprocess.java b/output/Preprocess.java
deleted file mode 100644
index de85b0b..0000000
--- a/output/Preprocess.java
+++ /dev/null
@@ -1,172 +0,0 @@
-package ecp.BDPA.assignment2;
-import org.apache.commons.lang.StringUtils;
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Counter;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-public class Preprocess extends Configured implements Tool {
-   private enum FinalLineNumCounter{
-	   Final_NUM
-   }
-   private enum LineNumCounter{
-	   NUM
-   }
-   
-   public static void main(String[] args) throws Exception {
-      System.out.println(Arrays.toString(args));
-      Configuration conf = new Configuration();
-      conf.set("StopWordsFileName", args[2]);
-      conf.set("WordFreqFileName", args[3]);
-      conf.set("mapreduce.map.output.compress", "true");
-      int res = ToolRunner.run(conf, new Preprocess(), args);
-      
-      System.exit(res);
-   }
-   @Override
-   public int run(String[] args) throws Exception {
-      System.out.println(Arrays.toString(args));
-      Job job = new Job(getConf(), "Preprocess");
-      job.setJarByClass(Preprocess.class);
-      job.setOutputKeyClass(LongWritable.class);
-      job.setOutputValueClass(Text.class);
-      job.setMapperClass(Map.class);
-      job.setReducerClass(Reduce.class);
-      job.setNumReduceTasks(1);
-      
-      job.setInputFormatClass(TextInputFormat.class);
-      job.setOutputFormatClass(TextOutputFormat.class);
-      FileInputFormat.addInputPath(job, new Path(args[0]));
-      FileOutputFormat.setOutputPath(job, new Path(args[1]));
-      job.waitForCompletion(true);
-      
-      return 0;
-   }
-   
-   public static class Map extends Mapper<LongWritable, Text, LongWritable, Text> {
-	  private Text word = new Text();
-      private Text words = new Text();
-      public List<Text> stopWords = new ArrayList<Text>(); 
-      public HashMap<String,Integer> wordFreq = new HashMap<String,Integer>(); 
-      public void loadWordFreq(String filename) throws IOException{
-    	  Path pt=new Path(filename);//Location of file in HDFS
-          FileSystem fs = FileSystem.get(new Configuration());
-    	  BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(pt)));
-    		  
-    	  String sCurrentLine;
-    	      
-          while ((sCurrentLine = br.readLine()) != null) {
-        	  String[] wordfreq = sCurrentLine.split("[\\s]+");
-        	  if (wordfreq.length != 2){
-        		  System.out.println("WARNING: WRONG INPUT FORMAT");
-        	  }
-        	  this.wordFreq.put(wordfreq[0], new Integer(wordfreq[1]));
-    	  }
-          
-          br.close();
-          
-          return;
-       }
-      public void loadStopWords(String filename) throws IOException{
-    	  Path pt=new Path(filename);//Location of file in HDFS
-          FileSystem fs = FileSystem.get(new Configuration());
-    	  BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(pt)));
-    		  
-    	  String sCurrentLine;
-    	      
-          while ((sCurrentLine = br.readLine()) != null) {
-        	  String stopWord = sCurrentLine.replaceAll("[^A-Za-z]+", "");
-        	  Text t = new Text();
-        	  t.set(stopWord);
-        	  this.stopWords.add(t);
-    	  }
-          
-          br.close();
-          
-          return;
-       }
-      
-      public void setup(Context context) throws IOException,InterruptedException { 
-    	 super.setup(context);
-    	 String filename = context.getConfiguration().get("StopWordsFileName");
-    	 loadStopWords(filename);
-    	 loadWordFreq(context.getConfiguration().get("WordFreqFileName"));
-    	 
-      }
-      
-      @Override
-      public void map(LongWritable key, Text value, Context context)
-              throws IOException, InterruptedException {
-    	 Counter counter = context.getCounter(LineNumCounter.NUM);
-    	 Counter counter_final = context.getCounter(FinalLineNumCounter.Final_NUM);
-    	 counter.increment(1);
-    	 Set<String> wordSet = new HashSet<String>();
-    	 if (value.toString().isEmpty()){
-    		 return;
-    	 }	
-         for (String token: value.toString().split("\\s+|-{2,}+")) {
-        	 word.set(token.replaceAll("[^A-Za-z0-9]+", "").toLowerCase());
-        	 if(word.toString().length()==0){
-        		 continue;
-        	 }
-        	 else if (stopWords.contains(word)){
-        		 continue;
-        	 }else if(!wordFreq.containsKey(word.toString())){
-        		 System.out.println("WARN: HASHTABLE DON'T HAVE WORD:");
-        		 System.out.println(word);	 
-        	 }else{
-        		 wordSet.add(word.toString());
-        		 }
-         }
-         if (wordSet.isEmpty()){
-        	 return;
-         }else{
-        	 counter_final.increment(1);
-         }
-         List<String> wordList = new ArrayList<String>(wordSet);
-         
-         Collections.sort(wordList, new Comparator<String>() {
-        	 @Override
-        	 public int compare(String s1, String s2)
-        	 {
-        		 return  wordFreq.get(s1).compareTo(wordFreq.get(s2));
-        	 }
-         });
-         words.set(StringUtils.join(wordList,","));
-         context.write(new LongWritable(counter.getValue()), words);
-      }
-   }
-   public static class Reduce extends Reducer<LongWritable, Text, LongWritable, Text> {
-      @Override
-      public void reduce(LongWritable key, Iterable<Text> values, Context context)
-              throws IOException, InterruptedException {
-    	 Text words = new Text();
-    	 for (Text v : values){
-    		 words.set(v);
-    	 }
-         context.write(key, words);
-      }
-   }
-}
-- 
GitLab