Mohammed Meftah / BDPA_Assign2_MMEFTAH · Commits

Commit 9329fa10
authored Mar 19, 2017 by cloudera_vm

Efficient Method

parent efdea1c5
Changes 16
Assign2/Qb_Efficient_1000/._SUCCESS.crc
0 → 100644
File added
Assign2/Qb_Efficient_1000/.part-r-00000.crc
0 → 100644
File added
Assign2/Qb_Efficient_1000/_SUCCESS
0 → 100644
Assign2/Qb_Efficient_1000/nb_comp
0 → 100644
2833
\ No newline at end of file
Assign2/Qb_Efficient_1000/part-r-00000
0 → 100644
1195--2228 1.0
1195--7082 1.0
1255--2286 0.9
1255--7142 1.0
1322--7209 1.0
1391--7278 1.0
1456--7343 1.0
1519--7406 1.0
1575--7462 1.0
1642--7529 1.0
2175--6998 0.8
2228--7082 1.0
2286--7142 0.9
Assign2/bin/SetSimilarityJoins/Qb_Efficient_1000$COUNTS.class
0 → 100644
File added
Assign2/bin/SetSimilarityJoins/Qb_Efficient_1000$Map.class
0 → 100644
File added
Assign2/bin/SetSimilarityJoins/Qb_Efficient_1000$Reduce.class
0 → 100644
File added
Assign2/bin/SetSimilarityJoins/Qb_Efficient_1000.class
0 → 100644
File added
Assign2/bin/SetSimilarityJoins/Qb_invert_index$COUNTS.class
0 → 100644
File added
Assign2/bin/SetSimilarityJoins/Qb_invert_index$Map.class
0 → 100644
File added
Assign2/bin/SetSimilarityJoins/Qb_invert_index$Reduce.class
0 → 100644
File added
Assign2/bin/SetSimilarityJoins/Qb_invert_index.class
0 → 100644
File added
Assign2/hadoop.log
(diff collapsed; contents not shown)
Assign2/src/SetSimilarityJoins/Qb_Efficient_1000.java
0 → 100644
package SetSimilarityJoins;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.google.common.collect.Sets;

public class Qb_Efficient_1000 extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        System.out.println(Arrays.toString(args));
        int res = ToolRunner.run(new Configuration(), new Qb_Efficient_1000(), args);
        System.exit(res);
    }

    // Counts how many candidate pairs the reducer actually compares.
    public static enum COUNTS {
        COUNT_COMP
    };

    @Override
    public int run(String[] args) throws Exception {
        System.out.println(Arrays.toString(args));
        Job job = new Job(getConf(), "Qb_Efficient_1000");
        job.setJarByClass(Qb_Efficient_1000.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.getConfiguration().set(
                "mapreduce.input.keyvaluelinerecordreader.key.value.separator", ",");

        Path outputFilePath = new Path(args[1]);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, outputFilePath);

        // Delete the output directory of a previous run, if any.
        FileSystem fs = FileSystem.newInstance(getConf());
        if (fs.exists(outputFilePath)) {
            fs.delete(outputFilePath, true);
        }

        job.waitForCompletion(true);

        // Write the number of comparisons next to the job output, in nb_comp.
        long counter = job.getCounters().findCounter(COUNTS.COUNT_COMP).getValue();
        Path countFile = new Path(new Path(args[1]), "nb_comp");
        File file = new File(countFile.toString());
        FileWriter fileWriter = new FileWriter(file);
        fileWriter.write(String.valueOf(counter));
        fileWriter.flush();
        fileWriter.close();
        return 0;
    }

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {

        String doc_path = "/home/cloudera/workspace/bpa/Assign2/Preprocessing_1/processed_doc_1000";
        String doc = new String(Files.readAllBytes(Paths.get(doc_path)));
        HashMap<String, String> id_doc = new HashMap<String, String>();

        // Load the preprocessed corpus into an id -> content map once per mapper.
        public Map() throws IOException {
            for (String line : doc.split("\n")) {
                id_doc.put(line.split(",")[0], line.split(",")[1]);
            }
        }

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String id_current = value.toString().split(",")[0];
            String val_current = value.toString().split(",")[1];
            int id_current_doc = Integer.valueOf(id_current);

            // Prefix filter: for a Jaccard threshold of 0.8, a pair can only reach
            // the threshold if the two documents share a word among the first
            // |d| - ceil(0.8 * |d|) + 1 words of the current document d.
            int max_words = val_current.split(" ").length
                    - (int) Math.ceil(0.8 * val_current.split(" ").length) + 1;

            for (String other_doc : id_doc.keySet()) {
                String other_doc_val = id_doc.get(other_doc);
                int id_other_doc = Integer.valueOf(other_doc);
                int m = 0;
                for (String word : id_doc.get(id_current).split(" ")) {
                    if (m > max_words) {
                        break;
                    }
                    // Emit each candidate pair once, keyed "smallerId--largerId".
                    if (other_doc_val.contains(word) && id_current_doc < id_other_doc) {
                        StringBuilder pair = new StringBuilder();
                        pair.append(id_current_doc);
                        pair.append("--");
                        pair.append(id_other_doc);
                        context.write(new Text(pair.toString()),
                                new Text(value.toString().split(",")[1].toLowerCase()));
                        break;
                    }
                    m++;
                }
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, DoubleWritable> {

        String doc_path = "/home/cloudera/workspace/bpa/Assign2/Preprocessing_1/processed_doc_1000";
        String doc = new String(Files.readAllBytes(Paths.get(doc_path)));
        HashMap<String, String> id_doc = new HashMap<String, String>();

        public Reduce() throws IOException {
            for (String line : doc.split("\n")) {
                id_doc.put(line.split(",")[0], line.split(",")[1]);
            }
        }

        // Jaccard similarity of two word arrays: |A ∩ B| / |A ∪ B|.
        public static double Jaccard(String[] A, String[] B) {
            Set<String> A_set = new HashSet<String>(Arrays.asList(A));
            Set<String> B_set = new HashSet<String>(Arrays.asList(B));
            Set<String> union = Sets.union(A_set, B_set);
            Set<String> intersection = Sets.intersection(A_set, B_set);
            return (double) intersection.size() / (double) union.size();
        }

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String[] ids = key.toString().split("--");
            String content_1 = id_doc.get(ids[0]).toLowerCase();
            String content_2 = id_doc.get(ids[1]).toLowerCase();
            double jaccsim = Jaccard(content_1.split(" "), content_2.split(" "));
            if (jaccsim >= 0.8) {
                context.write(key, new DoubleWritable(jaccsim));
            }
            // Count every pair actually compared.
            context.getCounter(COUNTS.COUNT_COMP).increment(1);
        }
    }
}
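The max_words bound in Qb_Efficient_1000.Map is the prefix-filtering argument for set-similarity joins: with a Jaccard threshold of 0.8, a pair can only reach the threshold if the two documents share a word among the first |d| - ceil(0.8 * |d|) + 1 words of a document d, so any pair that misses in that short prefix can be skipped without computing the full similarity. A minimal standalone sketch of the arithmetic (the class name and document lengths below are hypothetical, for illustration only):

// Standalone sketch of the prefix-filtering bound used in Qb_Efficient_1000.Map.
// The document lengths are made-up values chosen only to illustrate the formula.
public class PrefixFilterDemo {

    // Number of leading words that must be checked for a document of the given
    // length under Jaccard threshold t; matches max_words in the mapper above.
    static int prefixLength(int docLength, double t) {
        return docLength - (int) Math.ceil(t * docLength) + 1;
    }

    public static void main(String[] args) {
        double t = 0.8; // same threshold as the job
        for (int len : new int[] { 5, 10, 50 }) {
            System.out.println(len + " words -> check first " + prefixLength(len, t));
        }
        // Prints: 5 words -> check first 2
        //         10 words -> check first 3
        //         50 words -> check first 11
    }
}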
Assign2/src/SetSimilarityJoins/Qb_invert_index.java
0 → 100644
package SetSimilarityJoins;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.google.common.collect.Sets;

public class Qb_invert_index extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        System.out.println(Arrays.toString(args));
        int res = ToolRunner.run(new Configuration(), new Qb_invert_index(), args);
        System.exit(res);
    }

    // Counts how many candidate pairs the reducer actually compares.
    public static enum COUNTS {
        COUNT_COMP
    };

    @Override
    public int run(String[] args) throws Exception {
        System.out.println(Arrays.toString(args));
        Job job = new Job(getConf(), "Qb_invert_index");
        job.setJarByClass(Qb_invert_index.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.getConfiguration().set(
                "mapreduce.input.keyvaluelinerecordreader.key.value.separator", ",");

        Path outputFilePath = new Path(args[1]);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, outputFilePath);

        // Delete the output directory of a previous run, if any.
        FileSystem fs = FileSystem.newInstance(getConf());
        if (fs.exists(outputFilePath)) {
            fs.delete(outputFilePath, true);
        }

        job.waitForCompletion(true);

        // Write the number of comparisons next to the job output, in nb_comp.
        long counter = job.getCounters().findCounter(COUNTS.COUNT_COMP).getValue();
        Path countFile = new Path(new Path(args[1]), "nb_comp");
        File file = new File(countFile.toString());
        FileWriter fileWriter = new FileWriter(file);
        fileWriter.write(String.valueOf(counter));
        fileWriter.flush();
        fileWriter.close();
        return 0;
    }

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {

        String doc_path = "/home/cloudera/workspace/bpa/Assign2/Preprocessing_1/processed_doc_1000";
        String doc = new String(Files.readAllBytes(Paths.get(doc_path)));
        HashMap<String, String> id_doc = new HashMap<String, String>();

        // Load the preprocessed corpus into an id -> content map once per mapper.
        public Map() throws IOException {
            for (String line : doc.split("\n")) {
                id_doc.put(line.split(",")[0], line.split(",")[1]);
            }
        }

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            int id_current_doc = Integer.valueOf(value.toString().split(",")[0]);

            // Naive variant: emit every ordered pair, with no candidate filtering.
            for (String other_doc : id_doc.keySet()) {
                int id_other_doc = Integer.valueOf(other_doc);
                if (id_current_doc < id_other_doc) {
                    StringBuilder pair = new StringBuilder();
                    pair.append(id_current_doc);
                    pair.append("--");
                    pair.append(id_other_doc);
                    context.write(new Text(pair.toString()),
                            new Text(value.toString().split(",")[1].toLowerCase()));
                }
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, DoubleWritable> {

        String doc_path = "/home/cloudera/workspace/bpa/Assign2/Preprocessing_1/processed_doc_1000";
        String doc = new String(Files.readAllBytes(Paths.get(doc_path)));
        HashMap<String, String> id_doc = new HashMap<String, String>();

        public Reduce() throws IOException {
            for (String line : doc.split("\n")) {
                id_doc.put(line.split(",")[0], line.split(",")[1]);
            }
        }

        // Jaccard similarity of two word arrays: |A ∩ B| / |A ∪ B|.
        public static double Jaccard(String[] A, String[] B) {
            Set<String> A_set = new HashSet<String>(Arrays.asList(A));
            Set<String> B_set = new HashSet<String>(Arrays.asList(B));
            Set<String> union = Sets.union(A_set, B_set);
            Set<String> intersection = Sets.intersection(A_set, B_set);
            return (double) intersection.size() / (double) union.size();
        }

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String[] ids = key.toString().split("--");
            String content_1 = id_doc.get(ids[0]).toLowerCase();
            String content_2 = id_doc.get(ids[1]).toLowerCase();
            double jaccsim = Jaccard(content_1.split(" "), content_2.split(" "));
            if (jaccsim >= 0.8) {
                context.write(key, new DoubleWritable(jaccsim));
            }
            // Count every pair actually compared.
            context.getCounter(COUNTS.COUNT_COMP).increment(1);
        }
    }
}
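Both jobs share the same Jaccard helper, which reduces to |A ∩ B| / |A ∪ B| over the two word sets via Guava's Sets.union and Sets.intersection. A self-contained toy check of that helper, assuming Guava is on the classpath (the jobs already require it); the class name and the two sentences are invented for illustration:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import com.google.common.collect.Sets;

// Toy check of the Jaccard computation used by both Reduce classes.
public class JaccardDemo {
    public static void main(String[] args) {
        Set<String> a = new HashSet<String>(Arrays.asList("the cat sat on the mat".split(" ")));
        Set<String> b = new HashSet<String>(Arrays.asList("the cat sat on a mat".split(" ")));
        double sim = (double) Sets.intersection(a, b).size()
                / (double) Sets.union(a, b).size();
        // 5 shared words / 6 distinct words = 0.8333..., just above the 0.8 cutoff.
        System.out.println(sim);
    }
}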