Commit 4b2f399e authored by cloudera_vm's avatar cloudera_vm
Browse files

Clearing of Unique_words, not needed

parent bf10beba
This diff is collapsed.
......@@ -129,7 +129,6 @@ public class Preprocessing_1 extends Configured implements Tool {
for (String line : wordcount.split("\n")){
String[] word_count = line.split(",");
word_freq.put(word_count[0],new Integer(word_count[1]));
}
}
......
package Preprocessing;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import Preprocessing.Preprocessing_1.COUNTER;
/**
 * Hadoop MapReduce job that emits, for every input line, the set of distinct
 * tokens on that line that are not stopwords.
 *
 * <p>The map phase keys each kept token by the byte offset of its line; the
 * reduce phase collapses the tokens of one line into a set and writes
 * {@code offset,[token, token, ...]}. After a successful run the number of
 * records written by the reducer is stored in a file named {@code nb_lines}
 * inside the output directory.
 *
 * <p>Usage: {@code Unique_words <input path> <output path>}. An existing output
 * directory is deleted before the job starts.
 */
public class Unique_words extends Configured implements Tool {

    /** Local filesystem path of the stopword list loaded by every mapper instance. */
    private static final String STOPWORDS_FILE =
            "/home/cloudera/workspace/bpa/Assign2/stopwords/stopwords";

    /**
     * Command-line entry point; exits the JVM with the status returned by {@link #run}.
     *
     * @param args input path followed by output path
     * @throws Exception if the job cannot be configured or executed
     */
    public static void main(String[] args) throws Exception {
        System.out.println(Arrays.toString(args));
        // Launch this tool (the original launched Preprocessing_1 by mistake).
        int res = ToolRunner.run(new Configuration(), new Unique_words(), args);
        System.exit(res);
    }

    /** Job counters. {@code COUNT_LINES} counts the records written by the reducer. */
    public static enum COUNTS {
        COUNT_LINES
    }

    /**
     * Configures and runs the job, then writes the {@code COUNT_LINES} counter to
     * {@code <output>/nb_lines}.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @return 0 on success, 1 if the job failed, 2 if the arguments are missing
     * @throws Exception if job setup, execution or the counter file write fails
     */
    @Override
    public int run(String[] args) throws Exception {
        System.out.println(Arrays.toString(args));
        if (args.length < 2) {
            System.err.println("Usage: Unique_words <input path> <output path>");
            return 2;
        }

        Job job = new Job(getConf(), "Preprocessing_1_test");
        job.setJarByClass(Unique_words.class);
        job.setOutputKeyClass(LongWritable.class);
        // Also used as the map output value class; the reducer's set value is
        // rendered by TextOutputFormat through toString().
        job.setOutputValueClass(Text.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.getConfiguration().set("mapreduce.output.textoutputformat.separator", ",");
        job.setNumReduceTasks(1);

        Path outputFilePath = new Path(args[1]);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, outputFilePath);

        // newInstance() returns a private, uncached FileSystem, so it must be closed here.
        try (FileSystem fs = FileSystem.newInstance(getConf())) {
            if (fs.exists(outputFilePath)) {
                fs.delete(outputFilePath, true);
            }

            if (!job.waitForCompletion(true)) {
                // Do not publish a counter file for a failed job.
                return 1;
            }

            long counts = job.getCounters().findCounter(COUNTS.COUNT_LINES).getValue();
            Path outFile = new Path(outputFilePath, "nb_lines");
            try (BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(fs.create(outFile, true), StandardCharsets.UTF_8))) {
                writer.write(String.valueOf(counts));
            }
        }
        return 0;
    }

    /**
     * Splits each non-empty line into alphanumeric tokens and emits every token
     * that is not a stopword, keyed by the line's input key.
     */
    public static class Map extends Mapper<LongWritable, Text, LongWritable, Text> {
        private final Text word = new Text();
        private final Set<String> stopwords = new HashSet<String>();

        /**
         * Loads the stopword list from {@code STOPWORDS_FILE}.
         *
         * <p>Each line contributes one stopword: the text before the first comma,
         * trimmed. NOTE(review): this accepts both a plain one-word-per-line list
         * and a {@code word,count} listing; confirm against the actual file format.
         *
         * @throws IOException if the stopword file cannot be read
         */
        public Map() throws IOException {
            String contents = new String(
                    Files.readAllBytes(Paths.get(STOPWORDS_FILE)), StandardCharsets.UTF_8);
            for (String line : contents.split("\\r?\\n")) {
                int comma = line.indexOf(',');
                String entry = (comma >= 0 ? line.substring(0, comma) : line).trim();
                if (!entry.isEmpty()) {
                    // Populate the field; the original assigned a shadowing local,
                    // leaving the field empty so nothing was ever filtered.
                    stopwords.add(entry);
                }
            }
        }

        /**
         * Emits {@code (key, token)} for every token of the line that is non-empty
         * and not a stopword.
         *
         * @param key input key of the line (the byte offset under TextInputFormat)
         * @param value the line's text
         * @param context sink for the emitted pairs
         */
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            if (line.isEmpty()) {
                return;
            }
            for (String token : line.replaceAll("[^A-Za-z0-9]", " ").split("\\s+")) {
                // split() yields a leading empty token when the line starts with a separator.
                if (token.isEmpty() || stopwords.contains(token)) {
                    continue;
                }
                word.set(token);
                context.write(key, word);
            }
        }
    }

    /**
     * Collapses the tokens of one line into a set of distinct words and writes it
     * under the line's key.
     */
    public static class Reduce extends Reducer<LongWritable, Text, LongWritable, HashSet<Text>> {
        /** Distinct words of the key currently being reduced; reset for every key. */
        public HashSet<Text> words = new HashSet<Text>();

        /**
         * Writes {@code (key, distinct words)} and increments {@code COUNTS.COUNT_LINES}.
         *
         * @param key line key produced by the mapper
         * @param values tokens emitted for that line
         * @param context sink for the output record and the counter
         */
        @Override
        public void reduce(LongWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Start from an empty set so one line's words do not leak into the next.
            words.clear();
            for (Text keyword : values) {
                // Hadoop reuses the same Text instance across the iteration, so a copy is stored.
                words.add(new Text(keyword));
            }
            context.write(key, words);
            context.getCounter(COUNTS.COUNT_LINES).increment(1);
        }
    }
}
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment