Commit 9329fa10 authored by cloudera_vm

Efficient Method

parent efdea1c5
2833
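Sample reducer output (pairs of document ids joined with "--" and their Jaccard similarity; only pairs scoring at least 0.8 are written):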
1195--2228 1.0
1195--7082 1.0
1255--2286 0.9
1255--7142 1.0
1322--7209 1.0
1391--7278 1.0
1456--7343 1.0
1519--7406 1.0
1575--7462 1.0
1642--7529 1.0
2175--6998 0.8
2228--7082 1.0
2286--7142 0.9
package SetSimilarityJoins;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.google.common.collect.Sets;
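// Driver for the efficient set-similarity self-join on the 1000-document sample.
// The mapper prunes candidate pairs with a prefix-style filter, the reducer computes
// Jaccard similarity and keeps pairs scoring at least 0.8, and the number of
// comparisons (COUNT_COMP) is written to a local nb_comp file under the output path.
// Both mapper and reducer load the preprocessed corpus from a hard-coded local path.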
public class Qb_Efficient_1000 extends Configured implements Tool {
public static void main(String[] args) throws Exception {
System.out.println(Arrays.toString(args));
int res = ToolRunner.run(new Configuration(), new Qb_Efficient_1000(), args);
System.exit(res);
}
public static enum COUNTS {COUNT_COMP};
@Override
public int run(String[] args) throws Exception {
System.out.println(Arrays.toString(args));
Job job = new Job(getConf(), "Qb_Efficient_1000");
job.setJarByClass(Qb_Efficient_1000.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.getConfiguration().set("mapreduce.input.keyvaluelinerecordreader.key.value.separator",",");
Path outputFilePath = new Path(args[1]);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, outputFilePath);
FileSystem fs = FileSystem.newInstance(getConf());
if (fs.exists(outputFilePath)) {
fs.delete(outputFilePath, true);
}
job.waitForCompletion(true);
long counter = job.getCounters().findCounter(COUNTS.COUNT_COMP).getValue();
Path countFile = new Path(new Path(args[1]),"nb_comp");
File file = new File(countFile.toString());
FileWriter fileWriter = new FileWriter(file);
fileWriter.write(String.valueOf(counter));
fileWriter.flush();
fileWriter.close();
return 0;
}
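// Mapper: loads the preprocessed corpus (one "id,content" line per document) into
// memory, then for each input document emits a candidate pair "id1--id2" (id1 < id2)
// only when the other document's content contains one of the leading tokens of the
// current document (prefix-style pruning, bounded by max_words below).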
public static class Map extends Mapper<LongWritable, Text, Text, Text> {
String doc_path = "/home/cloudera/workspace/bpa/Assign2/Preprocessing_1/processed_doc_1000";
String doc = new String(Files.readAllBytes(Paths.get(doc_path)));
HashMap<String, String> id_doc = new HashMap<String, String>();
public Map() throws IOException{
for (String line : doc.split("\n")){
id_doc.put(line.split(",")[0],
line.split(",")[1]);
}
}
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String id_current = value.toString().split(",")[0];
String val_current = value.toString().split(",")[1];
int id_current_doc = Integer.valueOf(id_current);
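// Prefix bound for the 0.8 Jaccard threshold: only roughly the first
// n - ceil(0.8 * n) + 1 tokens of an n-token document need to be probed for a
// shared token (e.g. n = 10: 10 - 8 + 1 = 3), so the loop below stops early
// instead of scanning the whole document.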
int max_words = val_current.split(" ").length - (int) Math.ceil(0.8*val_current.split(" ").length) +1;
for (String other_doc : id_doc.keySet()) {
String other_doc_val = id_doc.get(other_doc);
int id_other_doc = Integer.valueOf(other_doc);
int m = 0;
for (String word : id_doc.get(id_current).split(" ")){
if (m>max_words){ break;}
if(other_doc_val.contains(word) && id_current_doc<id_other_doc){
StringBuilder pair = new StringBuilder();
pair.append(id_current_doc);
pair.append("--");
pair.append(id_other_doc);
context.write(new Text(pair.toString()),
new Text(
value.toString().split(",")[1].toLowerCase()));
break;
}
m++;
}
}
}
}
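// Reducer: looks up both documents' contents, computes their Jaccard similarity,
// emits the pair when the score is at least 0.8, and counts every comparison.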
public static class Reduce extends Reducer<Text, Text, Text, DoubleWritable> {
String doc_path = "/home/cloudera/workspace/bpa/Assign2/Preprocessing_1/processed_doc_1000";
String doc = new String(Files.readAllBytes(Paths.get(doc_path)));
HashMap<String, String> id_doc = new HashMap<String, String>();
public Reduce() throws IOException{
for (String line : doc.split("\n")){
id_doc.put(line.split(",")[0],
line.split(",")[1]);
}
}
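// Jaccard similarity of two token arrays: |A ∩ B| / |A ∪ B| over distinct tokens.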
public static double Jaccard(String[] A, String[] B){
Set<String> A_set = new HashSet<String>(Arrays.asList(A));
Set<String> B_set = new HashSet<String>(Arrays.asList(B));
Set<String> union = Sets.union(A_set, B_set);
Set<String> intersection = Sets.intersection(A_set, B_set);
return (double)intersection.size()/(double)union.size();
}
@Override
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
String[] ids = key.toString().split("--");
String content_1 = id_doc.get(ids[0]).toLowerCase();
String content_2 = id_doc.get(ids[1]).toLowerCase();
double jaccsim = Jaccard(content_1.split(" "),
content_2.split(" "));
if (jaccsim >=0.8){
context.write(key,new DoubleWritable(jaccsim));
}
context.getCounter(COUNTS.COUNT_COMP).increment(1);
}
}
}
package SetSimilarityJoins;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.google.common.collect.Sets;
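// Baseline variant kept for comparison: the mapper emits every pair of document ids
// (id1 < id2) with no pruning, so the reducer computes Jaccard similarity for all
// pairs; the comparison count is again written to nb_comp under the output path.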
public class Qb_invert_index extends Configured implements Tool {
public static void main(String[] args) throws Exception {
System.out.println(Arrays.toString(args));
int res = ToolRunner.run(new Configuration(), new Qb_invert_index(), args);
System.exit(res);
}
public static enum COUNTS {COUNT_COMP};
@Override
public int run(String[] args) throws Exception {
System.out.println(Arrays.toString(args));
Job job = new Job(getConf(), "Qb_invert_index");
job.setJarByClass(Qb_invert_index.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.getConfiguration().set("mapreduce.input.keyvaluelinerecordreader.key.value.separator",",");
Path outputFilePath = new Path(args[1]);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, outputFilePath);
FileSystem fs = FileSystem.newInstance(getConf());
if (fs.exists(outputFilePath)) {
fs.delete(outputFilePath, true);
}
job.waitForCompletion(true);
long counter = job.getCounters().findCounter(COUNTS.COUNT_COMP).getValue();
Path countFile = new Path(new Path(args[1]),"nb_comp");
File file = new File(countFile.toString());
FileWriter fileWriter = new FileWriter(file);
fileWriter.write(String.valueOf(counter));
fileWriter.flush();
fileWriter.close();
return 0;
}
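// Mapper: for each input document, emit the pair "id1--id2" for every other document
// with a larger id; all candidate pairs are sent to the reducer.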
public static class Map extends Mapper<LongWritable, Text, Text, Text> {
String doc_path = "/home/cloudera/workspace/bpa/Assign2/Preprocessing_1/processed_doc_1000";
String doc = new String(Files.readAllBytes(Paths.get(doc_path)));
HashMap<String, String> id_doc = new HashMap<String, String>();
public Map() throws IOException{
for (String line : doc.split("\n")){
id_doc.put(line.split(",")[0],
line.split(",")[1]);
}
}
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
int id_current_doc = Integer.valueOf(
value.toString().split(",")[0]);
for (String other_doc : id_doc.keySet()) {
int id_other_doc = Integer.valueOf(other_doc);
if (id_current_doc < id_other_doc){
StringBuilder pair = new StringBuilder();
pair.append(id_current_doc);
pair.append("--");
pair.append(id_other_doc);
context.write(new Text(pair.toString()),
new Text(
value.toString().split(",")[1].toLowerCase()));
}
}
}
}
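// Reducer: identical to Qb_Efficient_1000.Reduce; computes Jaccard similarity,
// keeps pairs scoring at least 0.8, and counts every comparison in COUNT_COMP.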
public static class Reduce extends Reducer<Text, Text, Text, DoubleWritable> {
String doc_path = "/home/cloudera/workspace/bpa/Assign2/Preprocessing_1/processed_doc_1000";
String doc = new String(Files.readAllBytes(Paths.get(doc_path)));
HashMap<String, String> id_doc = new HashMap<String, String>();
public Reduce() throws IOException{
for (String line : doc.split("\n")){
id_doc.put(line.split(",")[0],
line.split(",")[1]);
}
}
public static double Jaccard(String[] A, String[] B){
Set<String> A_set = new HashSet<String>(Arrays.asList(A));
Set<String> B_set = new HashSet<String>(Arrays.asList(B));
Set<String> union = Sets.union(A_set, B_set);
Set<String> intersection = Sets.intersection(A_set, B_set);
return (double)intersection.size()/(double)union.size();
}
@Override
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
String[] ids = key.toString().split("--");
String content_1 = id_doc.get(ids[0]).toLowerCase();
String content_2 = id_doc.get(ids[1]).toLowerCase();
double jaccsim = Jaccard(content_1.split(" "),
content_2.split(" "));
if (jaccsim >=0.8){
context.write(key,new DoubleWritable(jaccsim));
}
context.getCounter(COUNTS.COUNT_COMP).increment(1);
}
}
}
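For a quick local sanity check of the Jaccard helper shared by both reducers, a minimal sketch follows; the class name JaccardDemo and the sample tokens are made up for illustration, and it assumes Guava is on the classpath and the class sits in the same SetSimilarityJoins package.
package SetSimilarityJoins;

// Hypothetical sanity check for the static Jaccard helper; not part of the jobs.
public class JaccardDemo {
    public static void main(String[] args) {
        String[] a = "big data platform".split(" ");
        String[] b = "big data platforms".split(" ");
        // shared tokens {big, data} out of {big, data, platform, platforms} -> 2/4 = 0.5
        System.out.println(Qb_Efficient_1000.Reduce.Jaccard(a, b));
    }
}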