Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mohammed Meftah
BDPA_Assign2_MMEFTAH
Commits
bf10beba
Commit
bf10beba
authored
Mar 18, 2017
by
cloudera_vm
Browse files
Preprocessing test on pg100_test (5 lines with 1 empty)
parent
b410dcf5
Changes
10
Expand all
Hide whitespace changes
Inline
Side-by-side
Assign2/Preprocessing_1_test/.part-r-00000.crc
View file @
bf10beba
No preview for this file type
Assign2/Preprocessing_1_test/nb_output_records.txt
0 → 100644
View file @
bf10beba
4
\ No newline at end of file
Assign2/Preprocessing_1_test/part-r-00000
View file @
bf10beba
0,anyone anywhere ebook cost use
at no
78,restrictions whatsoever copy almost away give may
or no
14
8
,included license terms under re gutenberg project use
21
7
,online www org ebook gutenberg
at or
0,anyone anywhere ebook cost use
78,restrictions whatsoever copy almost away give may
14
9
,included license terms under re gutenberg project use
21
8
,online www org ebook gutenberg
Assign2/WordCount/.part-r-00000.crc
View file @
bf10beba
No preview for this file type
Assign2/WordCount/WordCount
View file @
bf10beba
...
...
@@ -910,7 +910,6 @@ amply,3
ampthill,1
amurath,2
amyntas,1
an,1896
anatomiz,2
anatomize,3
anatomy,4
...
...
@@ -1386,7 +1385,6 @@ astronomers,1
astronomical,1
astronomy,1
asunder,15
at,2536
atalanta,2
ate,3
ates,2
...
...
@@ -1514,7 +1512,6 @@ avails,2
avarice,2
avaricious,1
avaunt,15
ave,3
aveng,3
avenge,1
avenged,2
...
...
@@ -1566,7 +1563,6 @@ aye,15
ayez,1
azur,2
azure,1
b,16
ba,2
baa,1
babbl,1
...
...
@@ -5310,7 +5306,6 @@ cypriot,1
cyprus,28
cyrus,1
cytherea,3
d,8961
dabbled,1
dace,1
dad,3
...
...
@@ -6807,7 +6802,6 @@ dye,5
dyed,3
dyer,1
dying,48
e,142
each,240
eager,9
eagerly,3
...
...
@@ -7870,7 +7864,6 @@ eyestrings,1
eying,1
eyne,9
eyrie,1
f,11
fa,6
fabian,74
fable,4
...
...
@@ -8511,7 +8504,6 @@ flux,2
fluxive,1
fly,245
flying,17
fo,4
foal,1
foals,1
foam,4
...
...
@@ -9800,8 +9792,6 @@ gypsy,2
gyve,1
gyved,1
gyves,5
h,2
ha,230
haberdasher,5
habiliment,1
habiliments,4
...
...
@@ -9984,7 +9974,6 @@ hastily,5
hasting,2
hastings,149
hasty,21
hat,36
hatch,18
hatches,7
hatchet,1
...
...
@@ -10325,7 +10314,6 @@ hitting,2
hive,6
hives,1
hizzing,1
ho,209
hoa,5
hoar,7
hoard,4
...
...
@@ -10703,12 +10691,10 @@ ignorant,48
ii,171
iii,145
iiii,1
il,18
ilbow,1
ild,1
ilion,6
ilium,5
ill,279
illegitimate,2
illinois,222
illiterate,1
...
...
@@ -10727,7 +10713,6 @@ illustrious,5
illyria,13
illyrian,1
ils,2
im,1
image,46
imagery,1
images,11
...
...
@@ -11812,7 +11797,6 @@ knowledge,78
known,188
knows,213
kramer,1
l,23
la,78
laban,2
label,2
...
...
@@ -12376,7 +12360,6 @@ living,121
livings,1
lizard,2
lizards,2
ll,2409
llous,2
lnd,1
lo,74
...
...
@@ -12640,7 +12623,6 @@ lym,1
lymoges,2
lynn,1
lysander,103
m,30
ma,7
mab,3
macbeth,291
...
...
@@ -13833,7 +13815,6 @@ myself,567
myst,1
mysteries,4
mystery,17
n,159
nag,2
nage,1
nags,1
...
...
@@ -14066,7 +14047,6 @@ nit,2
nly,1
nnight,2
nnights,1
no,3814
noah,2
nob,2
nobility,37
...
...
@@ -14231,7 +14211,6 @@ ny,2
nym,63
nymph,9
nymphs,12
o,3053
oak,27
oaken,2
oaks,5
...
...
@@ -14473,7 +14452,6 @@ opprobriously,1
oppugnancy,1
opulency,1
opulent,2
or,3199
oracle,27
oracles,3
orange,5
...
...
@@ -14563,7 +14541,6 @@ oui,6
ounce,6
ounces,1
ouphes,2
our,3066
ours,88
ourself,24
ourselves,115
...
...
@@ -16778,7 +16755,6 @@ quoted,5
quotes,1
quoth,66
quotidian,2
r,92
rabbit,4
rabble,13
rabblement,2
...
...
@@ -17960,7 +17936,6 @@ ruttish,1
ry,60
rye,3
rything,1
s,7734
sa,6
saba,1
sabbath,2
...
...
@@ -20577,7 +20552,6 @@ syracusians,1
syria,6
syrups,2
system,1
t,1213
ta,96
taber,1
table,60
...
...
@@ -20865,7 +20839,6 @@ tetter,3
tevil,1
tewksbury,8
text,11
th,1177
thaes,1
thames,7
than,1885
...
...
@@ -21734,7 +21707,6 @@ tyrant,60
tyrants,10
tyrian,1
tyrrel,21
u,6
ubique,1
udders,1
udge,1
...
...
@@ -22597,7 +22569,6 @@ utterly,8
uttermost,7
utters,5
uy,1
v,99
va,1
vacancy,4
vacant,6
...
...
@@ -22713,7 +22684,6 @@ vaunts,2
vauvado,1
vaux,9
vaward,5
ve,1
veal,2
vede,1
vehemence,1
...
...
@@ -23047,7 +23017,6 @@ vulnerable,1
vulture,4
vultures,2
vurther,1
w,2
wad,1
waddled,1
wade,3
...
...
@@ -23492,7 +23461,6 @@ whoso,4
whosoe,2
whosoever,2
why,1476
wi,12
wick,1
wicked,64
wickednes,1
...
...
@@ -23605,7 +23573,6 @@ wishing,9
wishtly,1
wisp,1
wist,1
wit,269
witb,2
witch,94
witchcraft,18
...
...
@@ -23863,7 +23830,6 @@ xii,2
xiii,2
xiv,1
xv,1
y,51
yard,12
yards,5
yare,10
...
...
Assign2/hadoop.log
View file @
bf10beba
This diff is collapsed.
Click to expand it.
Assign2/pg100_test.txt
View file @
bf10beba
This eBook is for the use of anyone anywhere at no cost and with anyone cost
almost no restrictions whatsoever. You may copy it, give it away or
almost no restrictions whatsoever. You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org
Assign2/pg100_test.txt~
View file @
bf10beba
This eBook is for the use of anyone anywhere at no cost and with
This eBook is for the use of anyone anywhere at no cost and with
anyone cost
almost no restrictions whatsoever. You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org
Assign2/src/Preprocessing/Preprocessing_1.java
View file @
bf10beba
package
Preprocessing
;
import
java.io.File
;
import
java.io.FileWriter
;
import
java.io.IOException
;
import
java.nio.file.Files
;
import
java.nio.file.Paths
;
import
java.util.Arrays
;
import
java.util.Comparator
;
import
java.util.HashMap
;
import
java.util.Collections
;
import
java.util.LinkedHashSet
;
import
java.util.LinkedList
;
import
java.util.List
;
import
java.util.Map.Entry
;
import
org.apache.hadoop.conf.Configuration
;
import
org.apache.hadoop.conf.Configured
;
import
org.apache.hadoop.fs.FileSystem
;
import
org.apache.hadoop.fs.Path
;
import
org.apache.hadoop.io.IntWritable
;
import
org.apache.hadoop.io.LongWritable
;
import
org.apache.hadoop.io.Text
;
import
org.apache.hadoop.mapreduce.Job
;
...
...
@@ -19,19 +33,8 @@ import org.apache.hadoop.util.ToolRunner;
import
java.io.*
;
import
java.util.*
;
public
class
Preprocessing_1
extends
Configured
implements
Tool
{
public
static
enum
COUNTER
{
COUNT_LINES
};
public
static
void
main
(
String
[]
args
)
throws
Exception
{
System
.
out
.
println
(
Arrays
.
toString
(
args
));
...
...
@@ -41,6 +44,8 @@ public class Preprocessing_1 extends Configured implements Tool {
System
.
exit
(
res
);
}
public
static
enum
COUNTS
{
COUNT_LINES
};
@Override
public
int
run
(
String
[]
args
)
throws
Exception
{
...
...
@@ -72,14 +77,14 @@ public class Preprocessing_1 extends Configured implements Tool {
job
.
waitForCompletion
(
true
);
// Write counter to file
long
counter
=
job
.
getCounters
().
findCounter
(
COUNT
ER
.
COUNT_LINES
).
getValue
();
Path
outFile
=
new
Path
(
new
Path
(
args
[
1
]),
"
NB_LINES_AFTER_Preprocessing
.txt"
);
BufferedWriter
writer
=
new
BufferedWriter
(
new
OutputStream
Writer
(
fs
.
create
(
outFile
,
true
)
));
writer
.
write
(
String
.
valueOf
(
counter
)
);
w
riter
.
close
();
long
counter
=
job
.
getCounters
().
findCounter
(
COUNT
S
.
COUNT_LINES
).
getValue
();
Path
c
ou
n
tFile
=
new
Path
(
new
Path
(
args
[
1
]),
"
nb_output_records
.txt"
);
File
file
=
new
File
(
countFile
.
toString
());
FileWriter
fileWriter
=
new
File
Writer
(
file
);
fileWriter
.
write
(
String
.
valueOf
(
counter
));
fileWriter
.
flush
(
);
fileW
riter
.
close
();
return
0
;
}
...
...
@@ -91,167 +96,92 @@ public class Preprocessing_1 extends Configured implements Tool {
public
static
class
Map
extends
Mapper
<
LongWritable
,
Text
,
LongWritable
,
Text
>
{
private
Text
word
=
new
Text
();
private
HashSet
<
String
>
stopwords
=
new
HashSet
<
String
>();
String
stopwords_file
=
"/home/cloudera/workspace/bpa/Assign2/stopwords/stopwords"
;
String
stopwords
=
new
String
(
Files
.
readAllBytes
(
Paths
.
get
(
stopwords_file
)));
public
Map
()
throws
NumberFormatException
,
IOException
{
// Default constructor to load one time the stop words file
/* Read file of stopwords*/
BufferedReader
Reader
=
new
BufferedReader
(
new
FileReader
(
new
File
(
"/home/cloudera/workspace/bpa/Assign2/stopwords/stopwords"
)));
/* Add each line (word) in the variable stopwords*/
String
pattern
;
while
((
pattern
=
Reader
.
readLine
())
!=
null
)
{
stopwords
.
add
(
pattern
.
toLowerCase
());
}
Reader
.
close
();
public
Map
()
throws
IOException
{
System
.
out
.
println
(
stopwords
);
}
@Override
public
void
map
(
LongWritable
key
,
Text
value
,
Context
context
)
throws
IOException
,
InterruptedException
{
for
(
String
token:
value
.
toString
().
replaceAll
(
"[^a-zA-Z0-9 ]"
,
" "
).
split
(
"\\s+"
))
{
/* if word not in stop words list then we set word with the value then write it into context */
if
(!
stopwords
.
contains
(
token
.
toLowerCase
()))
{
// if token only contains a blank character we do not write it
if
(!
stopwords
.
contains
(
token
.
toLowerCase
()))
{
word
.
set
(
token
.
toLowerCase
());
context
.
write
(
key
,
word
);
}
}
}
}
public
static
class
Reduce
extends
Reducer
<
LongWritable
,
Text
,
LongWritable
,
Text
>
{
/* Initialise one time a hashmap to store each word of the vocabulary and its global
* frequency in pg100.txt from the wordcountpg100.txt */
private
static
HashMap
<
String
,
Integer
>
map_word_count
=
new
HashMap
<
String
,
Integer
>();
private
static
HashMap
<
String
,
Integer
>
word_freq
=
new
HashMap
<
String
,
Integer
>();
public
Reduce
()
throws
NumberFormatException
,
IOException
{
/*Default constructor to store (word,frequency) pair
* in the created hashmap from the file wordcountpg100.txt */
public
Reduce
()
throws
IOException
{
BufferedReader
Reader_count
=
new
BufferedReader
(
new
FileReader
(
new
File
(
"/home/cloudera/workspace/bpa/Assign2/WordCount/WordCount"
)));
String
line
;
String
wordcount_file
=
"/home/cloudera/workspace/bpa/Assign2/WordCount/WordCount"
;
String
wordcount
=
new
String
(
Files
.
readAllBytes
(
Paths
.
get
(
wordcount_file
)));
while
((
line
=
Reader_count
.
readLine
())
!=
null
)
{
String
[]
parts
=
line
.
split
(
","
,
2
);
if
(
parts
.
length
>=
2
)
{
map_word_count
.
put
(
parts
[
0
].
toString
(),
new
Integer
(
parts
[
1
]));
}
else
{
System
.
out
.
println
(
"ignoring line: "
+
line
);
}
for
(
String
line
:
wordcount
.
split
(
"\n"
)){
String
[]
word_count
=
line
.
split
(
","
);
word_freq
.
put
(
word_count
[
0
],
new
Integer
(
word_count
[
1
]));
}
Reader_count
.
close
();
}
/*SOURCE : http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java
*/
public
static
<
K
,
V
extends
Comparable
<?
super
V
>>
LinkedHashSet
<
String
>
sortByValue
(
HashMap
<
K
,
V
>
map
){
List
<
java
.
util
.
Map
.
Entry
<
K
,
V
>>
list
=
new
LinkedList
<>(
map
.
entrySet
()
);
sortHM
(
HashMap
<
K
,
V
>
map
){
List
<
Entry
<
K
,
V
>>
list
=
new
LinkedList
<>(
map
.
entrySet
()
);
// sort the list of pairs
Collections
.
sort
(
list
,
new
Comparator
<
java
.
util
.
Map
.
Entry
<
K
,
V
>>()
Collections
.
sort
(
list
,
new
Comparator
<
Entry
<
K
,
V
>>()
{
public
int
compare
(
java
.
util
.
Map
.
Entry
<
K
,
V
>
o1
,
java
.
util
.
Map
.
Entry
<
K
,
V
>
o2
)
public
int
compare
(
Entry
<
K
,
V
>
o1
,
Entry
<
K
,
V
>
o2
)
{
return
(
o1
.
getValue
()).
compareTo
(
o2
.
getValue
());
}
}
);
// Create LinkedHashset to store the word in ascending order
LinkedHashSet
<
String
>
result
=
new
LinkedHashSet
<
String
>();
for
(
java
.
util
.
Map
.
Entry
<
K
,
V
>
entry
:
list
)
for
(
Entry
<
K
,
V
>
entry
:
list
)
{
result
.
add
(
entry
.
getKey
().
toString
());
}
return
result
;
}
@Override
public
void
reduce
(
LongWritable
key
,
Iterable
<
Text
>
values
,
Context
context
)
throws
IOException
,
InterruptedException
{
/*Create a reduced hashmap where each key is a word for the same
* mapper key and the value is the global frequency with the static hashmap
* word_word_count containing the global frequency of word in pg100.txt*/
HashMap
<
String
,
Integer
>
map_word_count_key
=
new
HashMap
<
String
,
Integer
>();
HashMap
<
String
,
Integer
>
line_word_count
=
new
HashMap
<
String
,
Integer
>();
for
(
Text
val
:
values
)
for
(
Text
token
:
values
)
{
/*store the global frequency of each word for words corresponding to a same key*/
map_word_count_key
.
put
(
val
.
toString
(),
map_word_count
.
get
(
val
.
toString
()));
line_word_count
.
put
(
token
.
toString
(),
word_freq
.
get
(
token
.
toString
()));
}
// Sort Hashmap and return a LinkedHashset (to keep the order) with word in ascending order
// Using the sortByValue method
LinkedHashSet
<
String
>
setvalue
=
new
LinkedHashSet
<
String
>();
setvalue
=
sortByValue
(
map_word_count_key
);
/* Concatenate the words in ascending order of frequency */
StringBuilder
reducedvalue
=
new
StringBuilder
();
for
(
String
val
:
setvalue
)
{
if
(
reducedvalue
.
length
()
!=
0
){
reducedvalue
.
append
(
' '
);
}
reducedvalue
.
append
(
val
);
StringBuilder
concat_words
=
new
StringBuilder
();
String
prefix
=
""
;
for
(
String
token
:
sortHM
(
line_word_count
))
{
concat_words
.
append
(
prefix
);
prefix
=
" "
;
concat_words
.
append
(
token
);
}
// write for each line the words in the ascending order if not empty
if
(!
reducedvalue
.
toString
().
isEmpty
()){
// Increment counter
context
.
getCounter
(
COUNTER
.
COUNT_LINES
).
increment
(
1
);
context
.
write
(
key
,
new
Text
(
reducedvalue
.
toString
()));
if
(!
concat_words
.
toString
().
isEmpty
()){
context
.
getCounter
(
COUNTS
.
COUNT_LINES
).
increment
(
1
);
context
.
write
(
key
,
new
Text
(
concat_words
.
toString
()));
}
}
...
...
Assign2/src/WordCount/WordCount.java
View file @
bf10beba
package
WordCount
;
import
java.io.IOException
;
import
java.nio.file.Files
;
import
java.nio.file.Paths
;
import
java.util.Arrays
;
import
org.apache.hadoop.conf.Configuration
;
import
org.apache.hadoop.conf.Configured
;
import
org.apache.hadoop.fs.FileSystem
;
import
org.apache.hadoop.fs.Path
;
import
org.apache.hadoop.io.IntWritable
;
import
org.apache.hadoop.io.LongWritable
;
import
org.apache.hadoop.io.IntWritable
;
import
org.apache.hadoop.io.Text
;
import
org.apache.hadoop.mapreduce.Job
;
import
org.apache.hadoop.mapreduce.Mapper
;
...
...
@@ -16,20 +22,6 @@ import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import
org.apache.hadoop.util.Tool
;
import
org.apache.hadoop.util.ToolRunner
;
import
Preprocessing.Preprocessing_1
;
import
Preprocessing.Preprocessing_1.COUNTER
;
import
Preprocessing.Preprocessing_1.Map
;
import
Preprocessing.Preprocessing_1.Reduce
;
import
java.io.BufferedReader
;
import
java.io.BufferedWriter
;
import
java.io.File
;
import
java.io.FileReader
;
import
java.io.IOException
;
import
java.io.OutputStreamWriter
;
import
java.util.Arrays
;
import
java.util.HashSet
;
public
class
WordCount
extends
Configured
implements
Tool
{
public
static
void
main
(
String
[]
args
)
throws
Exception
{
System
.
out
.
println
(
Arrays
.
toString
(
args
));
...
...
@@ -42,7 +34,7 @@ public class WordCount extends Configured implements Tool {
public
int
run
(
String
[]
args
)
throws
Exception
{
System
.
out
.
println
(
Arrays
.
toString
(
args
));
Job
job
=
new
Job
(
getConf
(),
"WordCount"
);
job
.
setJarByClass
(
Preprocessing_1
.
class
);
job
.
setJarByClass
(
WordCount
.
class
);
job
.
setOutputKeyClass
(
Text
.
class
);
job
.
setOutputValueClass
(
IntWritable
.
class
);
...
...
@@ -68,8 +60,6 @@ public class WordCount extends Configured implements Tool {
}
job
.
waitForCompletion
(
true
);
return
0
;
}
...
...
@@ -77,45 +67,20 @@ public class WordCount extends Configured implements Tool {
public
static
class
Map
extends
Mapper
<
LongWritable
,
Text
,
Text
,
IntWritable
>
{
private
final
static
IntWritable
ONE
=
new
IntWritable
(
1
);
private
Text
word
=
new
Text
();
private
String
stopwords_file
=
"/home/cloudera/workspace/bpa/Assign2/stopwords/stopwords"
;
@Override
public
void
map
(
LongWritable
key
,
Text
value
,
Context
context
)
throws
IOException
,
InterruptedException
{
/* Initialize a hashset variable, set of strings without duplicates*/
HashSet
<
String
>
stopwords
=
new
HashSet
<
String
>();
/* Read file of stopwords*/
BufferedReader
Reader
=
new
BufferedReader
(
new
FileReader
(
new
File
(
"/home/cloudera/workspace/bpa/Assign2/stopwords/stopwords"
)));