Commit 98159e2b authored by cloudera_vm

Q3 count unique words...

parent 4590e294
@@ -2,3 +2,4 @@
 /Question2/
 /StubDriver.class
 /StubMapper.class
+/Question3/
package Question2;

import java.util.Arrays;
import java.util.StringTokenizer;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Inverted index: maps each word to the list of input files it occurs in.
 * Expects three input paths (args[0..2]) and one output path (args[3]).
 */
public class InvIndex extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        System.out.println(Arrays.toString(args));
        int res = ToolRunner.run(new Configuration(), new InvIndex(), args);
        System.exit(res);
    }

    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf());
        job.setJobName("InvIndex");
        job.setJarByClass(InvIndex.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        Path outputFilePath = new Path(args[3]);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileInputFormat.addInputPath(job, new Path(args[1]));
        FileInputFormat.addInputPath(job, new Path(args[2]));
        FileOutputFormat.setOutputPath(job, outputFilePath);

        // Delete the output path if it already exists, so reruns don't fail.
        FileSystem fs = FileSystem.newInstance(getConf());
        if (fs.exists(outputFilePath)) {
            fs.delete(outputFilePath, true);
        }

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        private final Text word = new Text();
        private final Text filename = new Text();
        private boolean caseSensitive = false;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            caseSensitive = conf.getBoolean("wordcount.case.sensitive", false);
        }

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Reuse the Text instances rather than allocating per record.
            filename.set(((FileSplit) context.getInputSplit()).getPath().getName());
            String line = value.toString();
            if (!caseSensitive) {
                line = line.toLowerCase();
            }
            // Emit (word, filename) for every token in the line.
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, filename);
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(final Text key, final Iterable<Text> values,
                final Context context) throws IOException, InterruptedException {
            // Join the filenames with ", ". Calling values.iterator() again inside
            // the loop is unreliable (Hadoop hands back the same iterator), so
            // build the separator with a prefix instead.
            StringBuilder stringBuilder = new StringBuilder();
            String prefix = "";
            for (Text value : values) {
                stringBuilder.append(prefix).append(value.toString());
                prefix = ", ";
            }
            context.write(key, new Text(stringBuilder.toString()));
        }
    }
}
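Because the driver runs through ToolRunner, the wordcount.case.sensitive flag that Map.setup() reads can be supplied as a generic -Dwordcount.case.sensitive=true option at launch, or set programmatically before the run. A minimal sketch of the programmatic route (the launcher class below is hypothetical, added only for illustration):

package Question2;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class InvIndexLauncher {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Same effect as passing -Dwordcount.case.sensitive=true on the command
        // line; Map.setup() reads this flag before any records are processed.
        conf.setBoolean("wordcount.case.sensitive", true);
        System.exit(ToolRunner.run(conf, new InvIndex(), args));
    }
}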
@@ -10,7 +10,6 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Job;
package Question3;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Q3: count the words that occur in exactly one of the three input files,
 * after filtering out stopwords.
 */
public class InvertedIndex_Q3 extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        System.out.println(Arrays.toString(args));
        int res = ToolRunner.run(new Configuration(), new InvertedIndex_Q3(), args);
        System.exit(res);
    }

    @Override
    public int run(String[] args) throws Exception {
        System.out.println(Arrays.toString(args));
        Job job = Job.getInstance(getConf(), "InvertedIndex_Q3");
        job.setJarByClass(InvertedIndex_Q3.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        Path outputFilePath = new Path(args[3]);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileInputFormat.addInputPath(job, new Path(args[1]));
        FileInputFormat.addInputPath(job, new Path(args[2]));
        FileOutputFormat.setOutputPath(job, outputFilePath);

        // Delete the output path if it already exists, so reruns don't fail.
        FileSystem fs = FileSystem.newInstance(getConf());
        if (fs.exists(outputFilePath)) {
            fs.delete(outputFilePath, true);
        }

        // Propagate the job result instead of always returning 0.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        private final Text word = new Text();
        // Local path; readable here because tasks run locally on the
        // single-node Cloudera VM.
        private String stopwords_file =
                "/home/cloudera/workspace/bpa/Assign1/output_Q1.i/stopwords.csv";
        private String stopwords;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Read the stopword list once per mapper, not once per input line.
            stopwords = new String(Files.readAllBytes(Paths.get(stopwords_file)));
        }

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            Text filename = new Text(
                    ((FileSplit) context.getInputSplit()).getPath().getName());
            for (String token : value.toString().split("\\s+")) {
                // Substring match against the raw stopword file, as in the
                // original. The write must happen inside the loop; writing
                // once after the loop would emit only the last token.
                if (!stopwords.contains(token.toLowerCase())) {
                    word.set(token.toLowerCase());
                    context.write(word, filename);
                }
            }
        }
    }

    public enum MyCount {
        UNIQUE_WORDS
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Collect the distinct filenames this word appeared in.
            HashSet<String> set = new HashSet<String>();
            for (Text value : values) {
                set.add(value.toString());
            }
            // A word is "unique" when it occurs in exactly one input file.
            if (set.size() == 1) {
                context.getCounter(MyCount.UNIQUE_WORDS).increment(1);
                StringBuilder builder = new StringBuilder();
                String prefix = "";
                for (String value : set) {
                    builder.append(prefix);
                    prefix = ", ";
                    builder.append(value);
                }
                context.write(key, new Text(builder.toString()));
            }
        }
    }
}
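The UNIQUE_WORDS counter appears with the standard job counters on the console, but it can also be read back in the driver after the job completes. A minimal sketch of how run() could surface it explicitly (same job and MyCount names as above; the log message is illustrative):

// Inside run(), replacing the final return:
boolean ok = job.waitForCompletion(true);
long unique = job.getCounters()
        .findCounter(MyCount.UNIQUE_WORDS)
        .getValue();
System.out.println("Words occurring in exactly one input file: " + unique);
return ok ? 0 : 1;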