Commit 121fae99 authored by cloudera_vm's avatar cloudera_vm
Browse files

All changes: we now take capital letters into account

parent d47d6c7c
114969
\ No newline at end of file
115005
\ No newline at end of file
This diff is collapsed.
......@@ -7555,6 +7555,7 @@ Syracuse,9
Syracusian,7
Syracusians,1
Syria,6
T,32
TAILOR,14
TALBOT,81
TALE,1
......@@ -7703,6 +7704,7 @@ Terrible,1
Tester,1
Tetchy,1
Tewksbury,8
Th,97
Thames,7
Than,426
Thane,28
......@@ -7713,6 +7715,7 @@ Thasos,1
That,2984
Thaw,1
Thawing,1
The,4041
Theatre,1
Theban,1
Thebes,1
......@@ -17597,6 +17600,7 @@ hills,10
hilt,4
hilts,7
hily,1
him,5175
himself,440
hinc,1
hind,17
......@@ -17967,6 +17971,7 @@ illustrate,2
illustrated,1
illustrious,5
ils,2
im,1
image,45
imagery,1
images,11
......@@ -25475,6 +25480,7 @@ snowed,1
snowy,1
snuff,9
snuffs,1
so,4024
soak,1
soaking,2
soaks,1
......
This diff is collapsed.
......@@ -98,10 +98,10 @@ public class Preprocessing_1 extends Configured implements Tool {
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
for (String token: value.toString().replaceAll("[^a-zA-Z0-9 ]", " ").split("\\s+")) {
for (String token: value.toString().replaceAll("[^A-Za-z0-9 ]", " ").split("\\s+")) {
if (!stopwords.contains(token.toLowerCase())) {
word.set(token.toLowerCase());
if (!stopwords.contains(token)) {
word.set(token);
context.write(key, word);
}
}
......
......@@ -2,12 +2,14 @@ package Preprocessing;
import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
......@@ -67,13 +69,13 @@ public class stopwords extends Configured implements Tool {
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
for (String token: value.toString().split("\\s+")) {
word.set(token.replaceAll("[^A-Za-z0-9]", ""));
word.set(token.replaceAll("[^A-Za-z0-9 ]", " "));
context.write(word, ONE);
}
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
public static class Reduce extends Reducer<Text, IntWritable, Text, NullWritable> {
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
......@@ -82,7 +84,7 @@ public class stopwords extends Configured implements Tool {
sum += val.get();
}
if (sum > 4000) {
context.write(key, new IntWritable(0));
context.write(key, NullWritable.get());
}
}
}
......
......@@ -77,7 +77,7 @@ public class WordCount extends Configured implements Tool {
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
for (String token: value.toString().replaceAll("[^a-zA-Z0-9 ]", " ").split("\\s+")) {
for (String token: value.toString().replaceAll("[^A-Za-z0-9 ]", " ").split("\\s+")) {
if (!stopwords.contains(token)) {
word.set(token);
context.write(word, ONE);
......
0
And 0
I 0
The 0
a 0
and 0
as 0
be 0
for 0
have 0
he 0
him 0
his 0
in 0
is 0
it 0
me 0
my 0
not 0
of 0
so 0
that 0
the 0
this 0
thou 0
to 0
will 0
with 0
you 0
your 0
And
I
a
and
as
be
for
have
he
his
in
is
it
me
my
not
of
that
the
this
thou
to
will
with
you
your
And
I
The
a
and
as
......@@ -8,7 +7,6 @@ be
for
have
he
him
his
in
is
......@@ -17,7 +15,6 @@ me
my
not
of
so
that
the
this
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment