diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..397b4a7624e35fa60563a9c03b1213d93f7b6546 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.log diff --git a/Assign2/Preprocessing_1_test/._SUCCESS.crc b/Assign2/Preprocessing_1_test/._SUCCESS.crc deleted file mode 100644 index 3b7b044936a890cd8d651d349a752d819d71d22c..0000000000000000000000000000000000000000 Binary files a/Assign2/Preprocessing_1_test/._SUCCESS.crc and /dev/null differ diff --git a/Assign2/Preprocessing_1_test/.part-r-00000.crc b/Assign2/Preprocessing_1_test/.part-r-00000.crc deleted file mode 100644 index a82a4326b15ab48d1271a57d45ab704cfe992207..0000000000000000000000000000000000000000 Binary files a/Assign2/Preprocessing_1_test/.part-r-00000.crc and /dev/null differ diff --git a/Assign2/Preprocessing_1_test/_SUCCESS b/Assign2/Preprocessing_1_test/_SUCCESS deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/Assign2/Preprocessing_1_test/nb_output_records.txt b/Assign2/Preprocessing_1_test/nb_output_records.txt deleted file mode 100644 index bf0d87ab1b2b0ec1a11a3973d2845b42413d9767..0000000000000000000000000000000000000000 --- a/Assign2/Preprocessing_1_test/nb_output_records.txt +++ /dev/null @@ -1 +0,0 @@ -4 \ No newline at end of file diff --git a/Assign2/Preprocessing_1_test/part-r-00000 b/Assign2/Preprocessing_1_test/part-r-00000 deleted file mode 100644 index 96f5477bcdb5d053e2900575a5e1ff51e2c1ebd1..0000000000000000000000000000000000000000 --- a/Assign2/Preprocessing_1_test/part-r-00000 +++ /dev/null @@ -1,4 +0,0 @@ -0,anyone anywhere ebook cost use -78,restrictions whatsoever copy almost away give may -149,included license terms under re gutenberg project use -218,online www org ebook gutenberg diff --git a/Assign2/src/SetSimilarityJoins/Qa_ALL_PAIRS.java b/Assign2/src/SetSimilarityJoins/Qa_ALL_PAIRS.java deleted file mode 100644 index 746cc760460e031fb57dc1d9b96051af7235ec53..0000000000000000000000000000000000000000 --- a/Assign2/src/SetSimilarityJoins/Qa_ALL_PAIRS.java +++ /dev/null @@ -1,181 +0,0 @@ -package SetSimilarityJoins; - - -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.Collections; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedHashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map.Entry; -import java.util.Set; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configured; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.DoubleWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.util.Tool; -import org.apache.hadoop.util.ToolRunner; - -import com.google.common.collect.Sets; - - - -public class Qa_ALL_PAIRS extends Configured implements Tool { - - public static void main(String[] args) throws Exception { - - System.out.println(Arrays.toString(args)); - int res = ToolRunner.run(new Configuration(), new Qa_ALL_PAIRS(), args); - System.exit(res); - } - - public static enum COUNTS {COUNT_COMP}; - - @Override - public int run(String[] args) throws Exception { - System.out.println(Arrays.toString(args)); - Job job = new Job(getConf(), "Qa_ALL_PAIRS"); - job.setJarByClass(Qa_ALL_PAIRS.class); - job.setOutputKeyClass(Text.class); - job.setOutputValueClass(Text.class); - - job.setMapperClass(Map.class); - job.setReducerClass(Reduce.class); - - - job.setInputFormatClass(TextInputFormat.class); - job.setOutputFormatClass(TextOutputFormat.class); - - job.getConfiguration().set("mapreduce.output.textoutputformat.separator", ","); - - job.getConfiguration().set("mapreduce.input.keyvaluelinerecordreader.key.value.separator",","); - - - Path outputFilePath = new Path(args[1]); - - FileInputFormat.addInputPath(job, new Path(args[0])); - FileOutputFormat.setOutputPath(job, outputFilePath); - - FileSystem fs = FileSystem.newInstance(getConf()); - - if (fs.exists(outputFilePath)) { - fs.delete(outputFilePath, true); - } - - job.waitForCompletion(true); - - long counter = job.getCounters().findCounter(COUNTS.COUNT_COMP).getValue(); - Path countFile = new Path(new Path(args[1]),"nb_comp"); - File file = new File(countFile.toString()); - FileWriter fileWriter = new FileWriter(file); - fileWriter.write(String.valueOf(counter)); - fileWriter.flush(); - fileWriter.close(); - - return 0; - } - - public static class Map extends Mapper<LongWritable, Text, Text, Text> { - - String doc_path = "/home/cloudera/workspace/bpa/Assign2/Preprocessing_1/processed_doc"; - String doc = new String(Files.readAllBytes(Paths.get(doc_path))); - HashMap<String, String> id_doc = new HashMap<String, String>(); - - public Map() throws IOException{ - for (String line : doc.split("\n")){ - - id_doc.put(line.split(",")[0], - line.split(",")[1]); - /* - - */ - } - } - @Override - public void map(LongWritable key, Text value, Context context) - throws IOException, InterruptedException { - - - int id_current_doc = Integer.valueOf( - value.toString().split(",")[0]); - - - for (String other_doc : id_doc.keySet()) { - - int id_other_doc = Integer.valueOf(other_doc); - - if (id_current_doc < id_other_doc){ - StringBuilder pair = new StringBuilder(); - pair.append(id_current_doc); - pair.append("--"); - pair.append(id_other_doc); - context.write(new Text(pair.toString()), - new Text( - value.toString().split(",")[1].toLowerCase())); - } - } - } - } - - public static class Reduce extends Reducer<Text, Text, Text, DoubleWritable> { - - - String doc_path = "/home/cloudera/workspace/bpa/Assign2/Preprocessing_1/processed_doc"; - String doc = new String(Files.readAllBytes(Paths.get(doc_path))); - HashMap<String, String> id_doc = new HashMap<String, String>(); - - public Reduce() throws IOException{ - for (String line : doc.split("\n")){ - id_doc.put(line.split(",")[0], - line.split(",")[1]); - } - } - - public static double Jaccard(String[] A, String[] B){ - Set<String> A_set = new HashSet<String>(Arrays.asList(A)); - Set<String> B_set = new HashSet<String>(Arrays.asList(B)); - Set<String> union = Sets.union(A_set, B_set); - Set<String> intersection = Sets.intersection(A_set, B_set); - return (double)intersection.size()/(double)union.size(); - } - - @Override - public void reduce(Text key, Iterable<Text> values, Context context) - throws IOException, InterruptedException { - - String[] ids = key.toString().split("--"); - String content_1 = id_doc.get(ids[0]).toLowerCase(); - String content_2 = id_doc.get(ids[1]).toLowerCase(); - - - double jaccsim = Jaccard(content_1.split(" "), - content_2.split(" ")); - if (jaccsim >=0.8){ - context.write(key,new DoubleWritable(jaccsim)); - } - context.getCounter(COUNTS.COUNT_COMP).increment(1); - } - } - -} - diff --git a/Assign2/src/SetSimilarityJoins/Qb_invert_index.java b/Assign2/src/SetSimilarityJoins/Qb_invert_index.java deleted file mode 100644 index cc64b91545486ae52d2d4c96c9ed6bee68bff894..0000000000000000000000000000000000000000 --- a/Assign2/src/SetSimilarityJoins/Qb_invert_index.java +++ /dev/null @@ -1,179 +0,0 @@ -package SetSimilarityJoins; - - -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.Collections; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedHashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map.Entry; -import java.util.Set; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configured; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.DoubleWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.util.Tool; -import org.apache.hadoop.util.ToolRunner; - -import com.google.common.collect.Sets; - - - -public class Qb_invert_index extends Configured implements Tool { - - public static void main(String[] args) throws Exception { - - System.out.println(Arrays.toString(args)); - int res = ToolRunner.run(new Configuration(), new Qb_invert_index(), args); - System.exit(res); - } - - public static enum COUNTS {COUNT_COMP}; - - @Override - public int run(String[] args) throws Exception { - System.out.println(Arrays.toString(args)); - Job job = new Job(getConf(), "Qb_Efficient_1000"); - job.setJarByClass(Qb_invert_index.class); - job.setOutputKeyClass(Text.class); - job.setOutputValueClass(Text.class); - - job.setMapperClass(Map.class); - job.setReducerClass(Reduce.class); - - - job.setInputFormatClass(TextInputFormat.class); - job.setOutputFormatClass(TextOutputFormat.class); - - job.getConfiguration().set("mapreduce.input.keyvaluelinerecordreader.key.value.separator",","); - - - Path outputFilePath = new Path(args[1]); - - FileInputFormat.addInputPath(job, new Path(args[0])); - FileOutputFormat.setOutputPath(job, outputFilePath); - - FileSystem fs = FileSystem.newInstance(getConf()); - - if (fs.exists(outputFilePath)) { - fs.delete(outputFilePath, true); - } - - job.waitForCompletion(true); - - long counter = job.getCounters().findCounter(COUNTS.COUNT_COMP).getValue(); - Path countFile = new Path(new Path(args[1]),"nb_comp"); - File file = new File(countFile.toString()); - FileWriter fileWriter = new FileWriter(file); - fileWriter.write(String.valueOf(counter)); - fileWriter.flush(); - fileWriter.close(); - - return 0; - } - - public static class Map extends Mapper<LongWritable, Text, Text, Text> { - - String doc_path = "/home/cloudera/workspace/bpa/Assign2/Preprocessing_1/processed_doc_1000"; - String doc = new String(Files.readAllBytes(Paths.get(doc_path))); - HashMap<String, String> id_doc = new HashMap<String, String>(); - - public Map() throws IOException{ - for (String line : doc.split("\n")){ - - id_doc.put(line.split(",")[0], - line.split(",")[1]); - /* - - */ - } - } - @Override - public void map(LongWritable key, Text value, Context context) - throws IOException, InterruptedException { - - - int id_current_doc = Integer.valueOf( - value.toString().split(",")[0]); - - - for (String other_doc : id_doc.keySet()) { - - int id_other_doc = Integer.valueOf(other_doc); - - if (id_current_doc < id_other_doc){ - StringBuilder pair = new StringBuilder(); - pair.append(id_current_doc); - pair.append("--"); - pair.append(id_other_doc); - context.write(new Text(pair.toString()), - new Text( - value.toString().split(",")[1].toLowerCase())); - } - } - } - } - - public static class Reduce extends Reducer<Text, Text, Text, DoubleWritable> { - - - String doc_path = "/home/cloudera/workspace/bpa/Assign2/Preprocessing_1/processed_doc_1000"; - String doc = new String(Files.readAllBytes(Paths.get(doc_path))); - HashMap<String, String> id_doc = new HashMap<String, String>(); - - public Reduce() throws IOException{ - for (String line : doc.split("\n")){ - id_doc.put(line.split(",")[0], - line.split(",")[1]); - } - } - - public static double Jaccard(String[] A, String[] B){ - Set<String> A_set = new HashSet<String>(Arrays.asList(A)); - Set<String> B_set = new HashSet<String>(Arrays.asList(B)); - Set<String> union = Sets.union(A_set, B_set); - Set<String> intersection = Sets.intersection(A_set, B_set); - return (double)intersection.size()/(double)union.size(); - } - - @Override - public void reduce(Text key, Iterable<Text> values, Context context) - throws IOException, InterruptedException { - - String[] ids = key.toString().split("--"); - String content_1 = id_doc.get(ids[0]).toLowerCase(); - String content_2 = id_doc.get(ids[1]).toLowerCase(); - - - double jaccsim = Jaccard(content_1.split(" "), - content_2.split(" ")); - if (jaccsim >=0.8){ - context.write(key,new DoubleWritable(jaccsim)); - } - context.getCounter(COUNTS.COUNT_COMP).increment(1); - } - } - -} - diff --git a/Assign2/stopwords.csv b/Assign2/stopwords.csv deleted file mode 100644 index ba2294be7a1f2f8e5e25c4c581457e93f2b39a46..0000000000000000000000000000000000000000 --- a/Assign2/stopwords.csv +++ /dev/null @@ -1,118 +0,0 @@ -about -be -before -by -her -mr. -much -old -up -where -you -after -been -come -down -get -got -have -is -me -my -now -only -she -their -them -this -upon -we -will -but -do -great -had -it -like -most -no -on -take -the -then -thy -time -was -which -are -as -first -into -one -our -shall -they -think -us -all -at -of -that -your -go -his -how -make -never -out -very -when -who -with -could -did -every -good -he -i -in -it, -let -more -or -such -were -would -am -can -from -has -its -little -man -other -so -some -there -two -what -a -an -and -for -if -it. -know -made -may -must -not -said -see -should -than -these -to -any -him -thou