Commit bf10beba authored by cloudera_vm
Browse files

Preprocessing test on pg100_test (5 lines with 1 empty)

parent b410dcf5
0,anyone anywhere ebook cost use at no
78,restrictions whatsoever copy almost away give may or no
148,included license terms under re gutenberg project use
217,online www org ebook gutenberg at or
0,anyone anywhere ebook cost use
78,restrictions whatsoever copy almost away give may
149,included license terms under re gutenberg project use
218,online www org ebook gutenberg
......@@ -910,7 +910,6 @@ amply,3
ampthill,1
amurath,2
amyntas,1
an,1896
anatomiz,2
anatomize,3
anatomy,4
......@@ -1386,7 +1385,6 @@ astronomers,1
astronomical,1
astronomy,1
asunder,15
at,2536
atalanta,2
ate,3
ates,2
......@@ -1514,7 +1512,6 @@ avails,2
avarice,2
avaricious,1
avaunt,15
ave,3
aveng,3
avenge,1
avenged,2
......@@ -1566,7 +1563,6 @@ aye,15
ayez,1
azur,2
azure,1
b,16
ba,2
baa,1
babbl,1
......@@ -5310,7 +5306,6 @@ cypriot,1
cyprus,28
cyrus,1
cytherea,3
d,8961
dabbled,1
dace,1
dad,3
......@@ -6807,7 +6802,6 @@ dye,5
dyed,3
dyer,1
dying,48
e,142
each,240
eager,9
eagerly,3
......@@ -7870,7 +7864,6 @@ eyestrings,1
eying,1
eyne,9
eyrie,1
f,11
fa,6
fabian,74
fable,4
......@@ -8511,7 +8504,6 @@ flux,2
fluxive,1
fly,245
flying,17
fo,4
foal,1
foals,1
foam,4
......@@ -9800,8 +9792,6 @@ gypsy,2
gyve,1
gyved,1
gyves,5
h,2
ha,230
haberdasher,5
habiliment,1
habiliments,4
......@@ -9984,7 +9974,6 @@ hastily,5
hasting,2
hastings,149
hasty,21
hat,36
hatch,18
hatches,7
hatchet,1
......@@ -10325,7 +10314,6 @@ hitting,2
hive,6
hives,1
hizzing,1
ho,209
hoa,5
hoar,7
hoard,4
......@@ -10703,12 +10691,10 @@ ignorant,48
ii,171
iii,145
iiii,1
il,18
ilbow,1
ild,1
ilion,6
ilium,5
ill,279
illegitimate,2
illinois,222
illiterate,1
......@@ -10727,7 +10713,6 @@ illustrious,5
illyria,13
illyrian,1
ils,2
im,1
image,46
imagery,1
images,11
......@@ -11812,7 +11797,6 @@ knowledge,78
known,188
knows,213
kramer,1
l,23
la,78
laban,2
label,2
......@@ -12376,7 +12360,6 @@ living,121
livings,1
lizard,2
lizards,2
ll,2409
llous,2
lnd,1
lo,74
......@@ -12640,7 +12623,6 @@ lym,1
lymoges,2
lynn,1
lysander,103
m,30
ma,7
mab,3
macbeth,291
......@@ -13833,7 +13815,6 @@ myself,567
myst,1
mysteries,4
mystery,17
n,159
nag,2
nage,1
nags,1
......@@ -14066,7 +14047,6 @@ nit,2
nly,1
nnight,2
nnights,1
no,3814
noah,2
nob,2
nobility,37
......@@ -14231,7 +14211,6 @@ ny,2
nym,63
nymph,9
nymphs,12
o,3053
oak,27
oaken,2
oaks,5
......@@ -14473,7 +14452,6 @@ opprobriously,1
oppugnancy,1
opulency,1
opulent,2
or,3199
oracle,27
oracles,3
orange,5
......@@ -14563,7 +14541,6 @@ oui,6
ounce,6
ounces,1
ouphes,2
our,3066
ours,88
ourself,24
ourselves,115
......@@ -16778,7 +16755,6 @@ quoted,5
quotes,1
quoth,66
quotidian,2
r,92
rabbit,4
rabble,13
rabblement,2
......@@ -17960,7 +17936,6 @@ ruttish,1
ry,60
rye,3
rything,1
s,7734
sa,6
saba,1
sabbath,2
......@@ -20577,7 +20552,6 @@ syracusians,1
syria,6
syrups,2
system,1
t,1213
ta,96
taber,1
table,60
......@@ -20865,7 +20839,6 @@ tetter,3
tevil,1
tewksbury,8
text,11
th,1177
thaes,1
thames,7
than,1885
......@@ -21734,7 +21707,6 @@ tyrant,60
tyrants,10
tyrian,1
tyrrel,21
u,6
ubique,1
udders,1
udge,1
......@@ -22597,7 +22569,6 @@ utterly,8
uttermost,7
utters,5
uy,1
v,99
va,1
vacancy,4
vacant,6
......@@ -22713,7 +22684,6 @@ vaunts,2
vauvado,1
vaux,9
vaward,5
ve,1
veal,2
vede,1
vehemence,1
......@@ -23047,7 +23017,6 @@ vulnerable,1
vulture,4
vultures,2
vurther,1
w,2
wad,1
waddled,1
wade,3
......@@ -23492,7 +23461,6 @@ whoso,4
whosoe,2
whosoever,2
why,1476
wi,12
wick,1
wicked,64
wickednes,1
......@@ -23605,7 +23573,6 @@ wishing,9
wishtly,1
wisp,1
wist,1
wit,269
witb,2
witch,94
witchcraft,18
......@@ -23863,7 +23830,6 @@ xii,2
xiii,2
xiv,1
xv,1
y,51
yard,12
yards,5
yare,10
......
This diff is collapsed.
This eBook is for the use of anyone anywhere at no cost and with anyone cost
almost no restrictions whatsoever. You may copy it, give it away or
almost no restrictions whatsoever. You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org
This eBook is for the use of anyone anywhere at no cost and with
This eBook is for the use of anyone anywhere at no cost and with anyone cost
almost no restrictions whatsoever. You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org
package Preprocessing;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
......@@ -19,19 +33,8 @@ import org.apache.hadoop.util.ToolRunner;
import java.io.*;
import java.util.*;
public class Preprocessing_1 extends Configured implements Tool {
// Pre-rename counter enum from the old revision of this diff; the new
// revision declares the equivalent COUNTS enum below.
public static enum COUNTER {
COUNT_LINES
};
public static void main(String[] args) throws Exception {
System.out.println(Arrays.toString(args));
......@@ -41,6 +44,8 @@ public class Preprocessing_1 extends Configured implements Tool {
System.exit(res);
}
// Counts non-empty output lines; the final value is written to
// nb_output_records.txt after the job completes (see run()).
public static enum COUNTS {COUNT_LINES};
@Override
public int run(String[] args) throws Exception {
......@@ -72,14 +77,14 @@ public class Preprocessing_1 extends Configured implements Tool {
job.waitForCompletion(true);
// Write counter to file
long counter = job.getCounters().findCounter(COUNTER.COUNT_LINES).getValue();
Path outFile = new Path(new Path(args[1]),"NB_LINES_AFTER_Preprocessing.txt");
BufferedWriter writer = new BufferedWriter(
new OutputStreamWriter(
fs.create(outFile, true)));
writer.write(String.valueOf(counter));
writer.close();
long counter = job.getCounters().findCounter(COUNTS.COUNT_LINES).getValue();
Path countFile = new Path(new Path(args[1]),"nb_output_records.txt");
File file = new File(countFile.toString());
FileWriter fileWriter = new FileWriter(file);
fileWriter.write(String.valueOf(counter));
fileWriter.flush();
fileWriter.close();
return 0;
}
......@@ -91,167 +96,92 @@ public class Preprocessing_1 extends Configured implements Tool {
public static class Map extends Mapper<LongWritable, Text, LongWritable, Text> {
private Text word = new Text();
private HashSet<String> stopwords = new HashSet<String>();
String stopwords_file = "/home/cloudera/workspace/bpa/Assign2/stopwords/stopwords";
String stopwords = new String(Files.readAllBytes(Paths.get(stopwords_file)));
public Map() throws NumberFormatException, IOException{
// Default constructor to load one time the stop words file
/* Read file of stopwords*/
BufferedReader Reader = new BufferedReader(
new FileReader(
new File(
"/home/cloudera/workspace/bpa/Assign2/stopwords/stopwords")));
/* Add each line (word) in the variable stopwords*/
String pattern;
while ((pattern = Reader.readLine()) != null) {
stopwords.add(pattern.toLowerCase());
}
Reader.close();
public Map() throws IOException{
System.out.println(stopwords);
}
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
for (String token: value.toString().replaceAll("[^a-zA-Z0-9 ]", " ").split("\\s+")) {
/* if word not in stop words list then we set word with the value then write it into context */
if (!stopwords.contains(token.toLowerCase())) {
// if token only contains a blank character we do not write it
if (!stopwords.contains(token.toLowerCase())) {
word.set(token.toLowerCase());
context.write(key, word);
}
}
}
}
public static class Reduce extends Reducer<LongWritable, Text, LongWritable, Text> {
/* Global frequency of each vocabulary word in pg100.txt, loaded once
 * from the WordCount output file (lines of the form "word,count"). */
private static HashMap<String, Integer> word_freq = new HashMap<String, Integer>();

public Reduce() throws IOException {
	// Read the whole word-count file and populate word_freq.
	String wordcount_file = "/home/cloudera/workspace/bpa/Assign2/WordCount/WordCount";
	String wordcount = new String(Files.readAllBytes(Paths.get(wordcount_file)));
	for (String line : wordcount.split("\n")) {
		// Split on the first comma only, and guard against blank or
		// malformed lines which would otherwise throw
		// ArrayIndexOutOfBoundsException / NumberFormatException.
		String[] word_count = line.split(",", 2);
		if (word_count.length == 2) {
			word_freq.put(word_count[0], Integer.valueOf(word_count[1].trim()));
		} else {
			System.out.println("ignoring line: " + line);
		}
	}
}
/*SOURCE : http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java
*/
public static <K, V extends Comparable<? super V>> LinkedHashSet<String>
sortByValue( HashMap<K, V> map ){
List<java.util.Map.Entry<K, V>> list = new LinkedList<>( map.entrySet() );
sortHM( HashMap<K, V> map ){
List<Entry<K, V>> list =
new LinkedList<>( map.entrySet() );
// sort the list of pairs
Collections.sort( list, new Comparator<java.util.Map.Entry<K, V>>()
Collections.sort( list, new Comparator<Entry<K, V>>()
{
public int compare( java.util.Map.Entry<K, V> o1, java.util.Map.Entry<K, V> o2 )
public int compare(Entry<K, V> o1, Entry<K, V> o2 )
{
return (o1.getValue()).compareTo(o2.getValue());
}
} );
// Create LinkedHashset to store the word in ascending order
LinkedHashSet<String> result = new LinkedHashSet<String>();
for (java.util.Map.Entry<K, V> entry : list)
for (Entry<K, V> entry : list)
{
result.add(entry.getKey().toString());
}
return result;
}
@Override
public void reduce(LongWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
/*Create a reduced hashmap where each key is a word for the same
* mapper key and the value is the global frequency with the static hashmap
* word_word_count containing the global frequency of word in pg100.txt*/
HashMap<String, Integer> map_word_count_key = new HashMap<String, Integer>();
HashMap<String, Integer> line_word_count = new HashMap<String, Integer>();
for (Text val : values)
for (Text token : values)
{
/*store the global frequency of each word for words corresponding to a same key*/
map_word_count_key.put(val.toString(),map_word_count.get(val.toString()));
line_word_count.put(token.toString(),
word_freq.get(token.toString()));
}
// Sort Hashmap and return a LinkedHashset (to keep the order) with word in ascending order
// Using the sortByValue method
LinkedHashSet<String> setvalue = new LinkedHashSet<String>();
setvalue = sortByValue(map_word_count_key);
/* Concatenate the words in ascending order of frequency */
StringBuilder reducedvalue = new StringBuilder();
for (String val : setvalue) {
if (reducedvalue.length() !=0){
reducedvalue.append(' ');
}
reducedvalue.append(val);
StringBuilder concat_words = new StringBuilder();
String prefix = "";
for (String token : sortHM(line_word_count)) {
concat_words.append(prefix);
prefix = " ";
concat_words.append(token);
}
// write for each line the words in the ascending order if not empty
if(!reducedvalue.toString().isEmpty()){
// Increment counter
context.getCounter(COUNTER.COUNT_LINES).increment(1);
context.write(key, new Text(reducedvalue.toString()));
if(!concat_words.toString().isEmpty()){
context.getCounter(COUNTS.COUNT_LINES).increment(1);
context.write(key, new Text(concat_words.toString()));
}
}
......
package WordCount;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
......@@ -16,20 +22,6 @@ import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import Preprocessing.Preprocessing_1;
import Preprocessing.Preprocessing_1.COUNTER;
import Preprocessing.Preprocessing_1.Map;
import Preprocessing.Preprocessing_1.Reduce;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Arrays;
import java.util.HashSet;
public class WordCount extends Configured implements Tool {
public static void main(String[] args) throws Exception {
System.out.println(Arrays.toString(args));
......@@ -42,7 +34,7 @@ public class WordCount extends Configured implements Tool {
public int run(String[] args) throws Exception {
System.out.println(Arrays.toString(args));
Job job = new Job(getConf(), "WordCount");
job.setJarByClass(Preprocessing_1.class);
job.setJarByClass(WordCount.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
......@@ -68,8 +60,6 @@ public class WordCount extends Configured implements Tool {
}
job.waitForCompletion(true);
return 0;
}
......@@ -77,45 +67,20 @@ public class WordCount extends Configured implements Tool {
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable ONE = new IntWritable(1);
private Text word = new Text();
private String stopwords_file = "/home/cloudera/workspace/bpa/Assign2/stopwords/stopwords";
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
/* Initialize a hashset variable, set of strings without duplicates*/
HashSet<String> stopwords = new HashSet<String>();
/* Read file of stopwords*/
BufferedReader Reader = new BufferedReader(
new FileReader(
new File(
"/home/cloudera/workspace/bpa/Assign2/stopwords/stopwords")));