Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mohammed Meftah
BDPA_Assign2_MMEFTAH
Commits
98159e2b
Commit
98159e2b
authored
Feb 19, 2017
by
cloudera_vm
Browse files
Q3 count unique words...
parent
4590e294
Changes
9
Expand all
Hide whitespace changes
Inline
Side-by-side
Assign1/bin/.gitignore
View file @
98159e2b
...
...
@@ -2,3 +2,4 @@
/Question2/
/StubDriver.class
/StubMapper.class
/Question3/
Assign1/hadoop.log
View file @
98159e2b
This diff is collapsed.
Click to expand it.
Assign1/output_Q3/._SUCCESS.crc
0 → 100644
View file @
98159e2b
File added
Assign1/output_Q3/.part-r-00000.crc
0 → 100644
View file @
98159e2b
File added
Assign1/output_Q3/_SUCCESS
0 → 100644
View file @
98159e2b
Assign1/output_Q3/part-r-00000
0 → 100644
View file @
98159e2b
This diff is collapsed.
Click to expand it.
Assign1/src/Question2/InvIndex.java
deleted
100644 → 0
View file @
4590e294
package
Question2
;
import
java.util.Arrays
;
import
java.util.StringTokenizer
;
import
java.io.IOException
;
import
org.apache.hadoop.conf.Configuration
;
import
org.apache.hadoop.conf.Configured
;
import
org.apache.hadoop.fs.FileSystem
;
import
org.apache.hadoop.fs.Path
;
import
org.apache.hadoop.io.Text
;
import
org.apache.hadoop.io.LongWritable
;
import
org.apache.hadoop.mapreduce.Job
;
import
org.apache.hadoop.mapreduce.Mapper
;
import
org.apache.hadoop.mapreduce.Reducer
;
import
org.apache.hadoop.mapreduce.lib.input.FileSplit
;
import
org.apache.hadoop.mapreduce.lib.input.FileInputFormat
;
import
org.apache.hadoop.mapreduce.lib.input.TextInputFormat
;
import
org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
;
import
org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
;
import
org.apache.hadoop.util.Tool
;
import
org.apache.hadoop.util.ToolRunner
;
/**
 * Inverted-index MapReduce job: for every word in the inputs, emits the word
 * followed by a comma-separated list of the source file names it appears in.
 *
 * Usage: InvIndex &lt;input&gt;... &lt;output&gt;
 * (Backward compatible with the original fixed "3 inputs + 1 output" call.)
 */
public class InvIndex extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        System.out.println(Arrays.toString(args));
        int res = ToolRunner.run(new Configuration(), new InvIndex(), args);
        System.exit(res);
    }

    /**
     * Configures and runs the job.
     *
     * Generalized: the last argument is the output directory and every
     * preceding argument is an input path (the original hard-coded exactly
     * args[0..2] as inputs and args[3] as output).
     *
     * @return 0 on job success, 1 on job failure, 2 on bad arguments
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: InvIndex <input>... <output>");
            return 2;
        }
        Job job = Job.getInstance(getConf());
        job.setJobName("InvIndex");
        job.setJarByClass(InvIndex.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        Path outputFilePath = new Path(args[args.length - 1]);
        for (int i = 0; i < args.length - 1; i++) {
            FileInputFormat.addInputPath(job, new Path(args[i]));
        }
        FileOutputFormat.setOutputPath(job, outputFilePath);

        /* Delete the output path if it already exists so reruns don't fail. */
        FileSystem fs = FileSystem.newInstance(getConf());
        if (fs.exists(outputFilePath)) {
            fs.delete(outputFilePath, true);
        }
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Tokenizes each input line on whitespace and emits (word, sourceFileName).
     * Lower-cases the line unless "wordcount.case.sensitive" is set to true.
     */
    public static class Map extends Mapper<LongWritable, Text, Text, Text> {

        private final Text word = new Text();
        private final Text filename = new Text();
        private boolean caseSensitive = false;

        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            this.caseSensitive = conf.getBoolean("wordcount.case.sensitive", false);
        }

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Reuse the Text instance instead of allocating a new one per
            // record (the original did `filename = new Text(...)` every call).
            filename.set(((FileSplit) context.getInputSplit()).getPath().getName());
            String line = value.toString();
            if (!caseSensitive) {
                line = line.toLowerCase();
            }
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, filename);
            }
        }
    }

    /**
     * Concatenates every file name seen for a word into one comma-separated
     * value. Note: values are NOT de-duplicated, matching the original
     * behavior (a file appears once per occurrence of the word in it).
     */
    public static class Reduce extends Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(final Text key, final Iterable<Text> values,
                final Context context) throws IOException, InterruptedException {
            // Bug fix: the original called values.iterator().hasNext() inside
            // the for-each loop to decide whether to append ", ". Requesting a
            // second iterator from a Hadoop value iterable is unreliable (and
            // with a fresh iterator it would always report hasNext, producing
            // a trailing separator). The prefix idiom needs only one pass.
            StringBuilder stringBuilder = new StringBuilder();
            String prefix = "";
            for (Text value : values) {
                stringBuilder.append(prefix).append(value.toString());
                prefix = ", ";
            }
            context.write(key, new Text(stringBuilder.toString()));
        }
    }
}
Assign1/src/Question2/InvertedIndex.java
View file @
98159e2b
...
...
@@ -10,7 +10,6 @@ import org.apache.hadoop.conf.Configuration;
import
org.apache.hadoop.conf.Configured
;
import
org.apache.hadoop.fs.FileSystem
;
import
org.apache.hadoop.fs.Path
;
import
org.apache.hadoop.io.IntWritable
;
import
org.apache.hadoop.io.LongWritable
;
import
org.apache.hadoop.io.Text
;
import
org.apache.hadoop.mapreduce.Job
;
...
...
Assign1/src/Question3/InvertedIndex_Q3.java
0 → 100644
View file @
98159e2b
package
Question3
;
import
java.io.IOException
;
import
java.nio.file.Files
;
import
java.nio.file.Paths
;
import
java.util.Arrays
;
import
java.util.HashSet
;
import
org.apache.hadoop.conf.Configuration
;
import
org.apache.hadoop.conf.Configured
;
import
org.apache.hadoop.fs.FileSystem
;
import
org.apache.hadoop.fs.Path
;
import
org.apache.hadoop.io.LongWritable
;
import
org.apache.hadoop.io.Text
;
import
org.apache.hadoop.mapreduce.Job
;
import
org.apache.hadoop.mapreduce.Mapper
;
import
org.apache.hadoop.mapreduce.Reducer
;
import
org.apache.hadoop.mapreduce.lib.input.FileSplit
;
import
org.apache.hadoop.mapreduce.lib.input.FileInputFormat
;
import
org.apache.hadoop.mapreduce.lib.input.TextInputFormat
;
import
org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
;
import
org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
;
import
org.apache.hadoop.util.Tool
;
import
org.apache.hadoop.util.ToolRunner
;
/**
 * Question 3: inverted index that drops stop words and, via a Hadoop counter
 * (MyCount.UNIQUE_WORDS), counts how many words occur in exactly one input
 * file. Only those single-file words are written to the output.
 *
 * Usage: InvertedIndex_Q3 &lt;input&gt;... &lt;output&gt;
 * (Backward compatible with the original fixed "3 inputs + 1 output" call.)
 */
public class InvertedIndex_Q3 extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        System.out.println(Arrays.toString(args));
        int res = ToolRunner.run(new Configuration(), new InvertedIndex_Q3(), args);
        System.exit(res);
    }

    /**
     * Configures and runs the job.
     *
     * @return 0 on job success, 1 on job failure, 2 on bad arguments
     */
    @Override
    public int run(String[] args) throws Exception {
        System.out.println(Arrays.toString(args));
        if (args.length < 2) {
            System.err.println("Usage: InvertedIndex_Q3 <input>... <output>");
            return 2;
        }
        // Job.getInstance replaces the deprecated new Job(conf, name) ctor.
        Job job = Job.getInstance(getConf(), "InvertedIndex_Q3");
        job.setJarByClass(InvertedIndex_Q3.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Generalized: every argument except the last is an input path.
        Path outputFilePath = new Path(args[args.length - 1]);
        for (int i = 0; i < args.length - 1; i++) {
            FileInputFormat.addInputPath(job, new Path(args[i]));
        }
        FileOutputFormat.setOutputPath(job, outputFilePath);

        /* Delete the output path if it already exists so reruns don't fail. */
        FileSystem fs = FileSystem.newInstance(getConf());
        if (fs.exists(outputFilePath)) {
            fs.delete(outputFilePath, true);
        }
        // Bug fix: the original always returned 0, hiding job failures from
        // the ToolRunner caller.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Emits (word, sourceFileName) for every whitespace-separated token that
     * is not a stop word. Tokens are lower-cased before filtering/emitting.
     */
    public static class Map extends Mapper<LongWritable, Text, Text, Text> {

        private final Text word = new Text();
        private final Text filename = new Text();
        // NOTE(review): the stopwords CSV is read from the LOCAL filesystem,
        // so this only works in local / pseudo-distributed mode — confirm, or
        // switch to the distributed cache for cluster runs.
        private String stopwords_file =
            "/home/cloudera/workspace/bpa/Assign1/output_Q1.i/stopwords.csv";
        private final HashSet<String> stopwords = new HashSet<String>();

        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            // Perf fix: the original re-read the entire stopwords file on
            // EVERY map() call. Load it once per task instead.
            // Correctness fix: the original tested stopwords with a substring
            // contains() on the raw file text, so e.g. "he" was dropped
            // whenever "the" was listed. Parse into a set for exact matches.
            String contents = new String(Files.readAllBytes(Paths.get(stopwords_file)));
            for (String s : contents.split("[,\\s]+")) {
                if (!s.isEmpty()) {
                    stopwords.add(s.toLowerCase());
                }
            }
        }

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            filename.set(((FileSplit) context.getInputSplit()).getPath().getName());
            for (String token : value.toString().split("\\s+")) {
                String lower = token.toLowerCase();
                // Bug fix: the original called context.write() once per line,
                // OUTSIDE this loop, so only the last non-stopword token was
                // emitted — and a stale word from a previous line was emitted
                // when a line contained nothing but stopwords. Emit every
                // kept token.
                if (!lower.isEmpty() && !stopwords.contains(lower)) {
                    word.set(lower);
                    context.write(word, filename);
                }
            }
        }
    }

    /** Counter for words that appear in exactly one input file. */
    public static enum MyCount {
        UNIQUE_WORDS,
    };

    /**
     * De-duplicates the file names per word; when a word occurs in exactly
     * one file it increments MyCount.UNIQUE_WORDS and writes the word with
     * its (single-entry) comma-separated file list. Words spanning several
     * files produce no output, matching the original behavior.
     */
    public static class Reduce extends Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            HashSet<String> set = new HashSet<String>();
            for (Text value : values) {
                set.add(value.toString());
            }
            // A word is "unique" when it occurs in exactly one file.
            if (set.size() == 1) {
                context.getCounter(MyCount.UNIQUE_WORDS).increment(1);
                StringBuilder builder = new StringBuilder();
                String prefix = "";
                for (String value : set) {
                    builder.append(prefix);
                    prefix = ", ";
                    builder.append(value);
                }
                context.write(key, new Text(builder.toString()));
            }
        }
    }
}
\ No newline at end of file
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment