diff --git a/InvertIndex1 b/InvertIndex1
deleted file mode 100644
index 91b9a1158228d59248b2207f9eeaf9eb000ec395..0000000000000000000000000000000000000000
--- a/InvertIndex1
+++ /dev/null
@@ -1,166 +0,0 @@
-package ecp.BDPA.assignment1;
-import java.io.BufferedReader;
-import java.io.DataOutput;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.PrintWriter;
-import java.security.KeyStore.LoadStoreParameter;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.ArrayWritable;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.FileSplit;
-import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-public class InvertIndex extends Configured implements Tool {
-
-    private enum ONLY_WORD_COUNTER {
-        PG100,
-        PG31100,
-        PG3200
-    }
-
-    public static void main(String[] args) throws Exception {
-        System.out.println(Arrays.toString(args));
-
-        Configuration conf = new Configuration();
-        conf.set("StopWordsFileName", args[2]);
-        int res = ToolRunner.run(conf, new InvertIndex(), args);
-
-
-        System.exit(res);
-    }
-    @Override
-    public int run(String[] args) throws Exception {
-        System.out.println(Arrays.toString(args));
-        Job job = new Job(getConf(), "InvertIndex");
-        job.setJarByClass(InvertIndex.class);
-        job.setOutputKeyClass(Text.class);
-        job.setOutputValueClass(Text.class);
-        job.setMapperClass(Map.class);
-        job.setReducerClass(Reduce.class);
-        //job.setCombinerClass(Reduce.class);
-        job.setNumReduceTasks(10);
-
-        job.setInputFormatClass(TextInputFormat.class);
-        job.setOutputFormatClass(TextOutputFormat.class);
-        job.setMapOutputKeyClass(Text.class);
-        job.setMapOutputValueClass(Text.class);
-
-        FileInputFormat.addInputPath(job, new Path(args[0]));
-        FileOutputFormat.setOutputPath(job, new Path(args[1]));
-        job.waitForCompletion(true);
-
-        Integer i;
-        PrintWriter writer = new PrintWriter(args[3], "UTF-8");
-        i = (int) job.getCounters().findCounter(ONLY_WORD_COUNTER.PG100).getValue();
-        writer.println("PG100: "+i.toString()+"\n");
-        i = (int) job.getCounters().findCounter(ONLY_WORD_COUNTER.PG31100).getValue();
-        writer.println("PG31100: "+i.toString()+"\n");
-        i = (int) job.getCounters().findCounter(ONLY_WORD_COUNTER.PG3200).getValue();
-        writer.println("PG3200: "+i.toString()+"\n");
-        writer.close();
-        return 0;
-    }
-
-    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
-        private Text word = new Text();
-        private Text filename = new Text();
-        public List<Text> stopWords = new ArrayList<Text>();
-
-        public void loadStopWords(String filename) throws IOException{
-            Path pt=new Path(filename);//Location of file in HDFS
-            FileSystem fs = FileSystem.get(new Configuration());
-            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(pt)));
-
-            String sCurrentLine;
-
-            while ((sCurrentLine = br.readLine()) != null) {
-                String stopWord = sCurrentLine.replaceAll("[^A-Za-z]+", "");
-                Text t = new Text();
-                t.set(stopWord);
-                stopWords.add(t);
-            }
-
-            br.close();
-
-            return;
-        }
-
-        public void setup(Context context) throws IOException,InterruptedException {
-            super.setup(context);
-            String filename = context.getConfiguration().get("StopWordsFileName");
-            loadStopWords(filename);
-        }
-        @Override
-        public void map(LongWritable key, Text value, Context context)
-                throws IOException, InterruptedException {
-            FileSplit fileSplit = (FileSplit)context.getInputSplit();
-            String name = fileSplit.getPath().getName();
-            filename.set(name);
-            for (String token: value.toString().split("\\s+|-{2,}+")) {
-                word.set(token.replaceAll("[^A-Za-z]+", "").toLowerCase());
-                if (!stopWords.contains(word)){
-                    context.write(word, filename);
-                }
-            }
-        }
-    }
-    public static class Reduce extends Reducer<Text, Text, Text, Text> {
-        @Override
-        public void reduce(Text key, Iterable<Text> values, Context context)
-                throws IOException, InterruptedException {
-            ArrayList<Text> list = new ArrayList<Text>();
-
-            for (Text value : values) {
-                list.add(new Text(value));
-            }
-
-            if (list.size()==1){
-                String filename = list.get(0).toString();
-                switch (filename){
-                case "pg100.txt":
-                    context.getCounter(ONLY_WORD_COUNTER.PG100).increment(1);
-                    break;
-                case "pg31100.txt":
-                    context.getCounter(ONLY_WORD_COUNTER.PG31100).increment(1);
-                    break;
-                case "pg3200.txt":
-                    context.getCounter(ONLY_WORD_COUNTER.PG3200).increment(1);
-                    break;
-                }
-            }
-
-            Set<Text> set = new HashSet<Text>(list);
-            String output = new String();
-            for (Text u : set){
-                output += u.toString()+'#'+Collections.frequency(list, u);
-                output += ',';
-            }
-            output = output.substring(0, output.length()-1);
-            Text val = new Text();
-            val.set(output);
-            context.write(key, val);
-
-        }
-    }
-}