# Assignment 2 for BDPA

by Wenyao JIN

## Preprocessing the input

### 1. Remake the stopwords file

    By slightly modifying the wordcount code from the previous assignment, we can output a stopwords file.

- take all three input files as before
- use whitespace or a sequence of two or more dashes (`--`) as the token delimiter
- strip all characters other than letters and digits
- transform all words to lower case
- output only words whose count is larger than 4000 (a reducer sketch follows the mapper code below)
    
    // Mapper: tokenize, strip non-alphanumeric characters, lower-case,
    // and emit (word, 1) for the reducer to aggregate.
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        for (String token : value.toString().split("\\s+|-{2,}+")) {
            word.set(token.replaceAll("[^A-Za-z0-9]+", "").toLowerCase());
            context.write(word, ONE);
        }
    }
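The reducer side is the standard wordcount sum plus the threshold. A minimal sketch, assuming the usual `Text`/`IntWritable` wordcount types (the class name `StopWordReducer` is ours, not taken from the repository):

    // Sum the counts per word and keep only words occurring more than
    // 4000 times; the surviving words form the stop word list.
    public static class StopWordReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            if (sum > 4000) {
                context.write(key, new IntWritable(sum));
            }
        }
    }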
          

    The stop word file can be found here.

### 2. Count word frequency of pg100.txt

By running the wordcount algorithm again, we recount the word frequency of pg100.txt, to be used later for word sorting. This time, capitalization is kept so that it can be taken into account in the similarity comparison. The output file can be found here.

### 3. Output lines

In this step, several tasks need to be done:

- Store all stopwords in a set
- Store all word frequencies in a hashmap
- For each line:
  - track the line number with a counter
  - skip empty lines
  - split the line into words
  - filter out special characters
  - drop words that are stopwords
  - remove duplicates
  - sort the remaining words by their pre-calculated frequency
  - output the words, with their line number as key

For this step, all tasks are done within the mapper. The tokenizer is whitespace or `--`, as before. A set container is used to avoid duplicates. Java's built-in sort is applied with a custom comparator based on the word frequencies. StringUtils's join function joins the words together with commas. The counter reveals a total of 124787 lines.

    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Track the total number of lines with a global counter.
        Counter counter = context.getCounter(DocLineCounter.NUM);
        counter.increment(1);
        // Skip empty lines.
        if (value.toString().isEmpty()) {
            return;
        }
        // A set removes duplicate words automatically.
        Set<String> wordSet = new HashSet<String>();
        for (String token : value.toString().split("\\s+|-{2,}+")) {
            String s = token.replaceAll("[^A-Za-z0-9]+", "");
            // Drop stopwords and empty tokens.
            if (stopWords.contains(s) || s.isEmpty()) {
                continue;
            } else if (!wordFreq.containsKey(s)) {
                System.out.println("WARN: hashtable does not have word:");
                System.out.println(s);
            }
            wordSet.add(s);
        }
        // Sort the words by their pre-computed corpus frequency.
        List<String> wordList = new ArrayList<String>(wordSet);
        Collections.sort(wordList, new Comparator<String>() {
            @Override
            public int compare(String s1, String s2) {
                return wordFreq.get(s1).compareTo(wordFreq.get(s2));
            }
        });
        // Emit the line number as key and the comma-joined words as value.
        words.set(StringUtils.join(wordList, ","));
        context.write(new LongWritable(counter.getValue()), words);
    }
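The mapper above relies on `stopWords` and `wordFreq` being populated beforehand. A minimal `setup()` sketch, assuming both files are tab-separated `word TAB count` text (the default TextOutputFormat layout) and that their paths are passed through the job `Configuration` under property names of our choosing:

    // Fields assumed on the mapper class:
    //   Set<String> stopWords;  Map<String, Long> wordFreq;
    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        stopWords = new HashSet<String>();
        wordFreq = new HashMap<String, Long>();
        Configuration conf = context.getConfiguration();
        FileSystem fs = FileSystem.get(conf);
        // Load the stop word list produced in step 1 ("stopwords.path" is
        // an assumed property name).
        BufferedReader br = new BufferedReader(
                new InputStreamReader(fs.open(new Path(conf.get("stopwords.path")))));
        String line;
        while ((line = br.readLine()) != null) {
            stopWords.add(line.split("\t")[0]);
        }
        br.close();
        // Load the per-word frequencies produced in step 2.
        br = new BufferedReader(
                new InputStreamReader(fs.open(new Path(conf.get("wordfreq.path")))));
        while ((line = br.readLine()) != null) {
            String[] kv = line.split("\t");
            wordFreq.put(kv[0], Long.parseLong(kv[1]));
        }
        br.close();
    }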

The output file can be found here. The total line number is output to HDFS as instructed; you can also find the file here.


## Set similarity joins

### 0. Preliminary implementation

In this part, we need to compute pairwise similarities. Before implementing the two approaches, a few basic modules need to be built.

#### Key Pair

In this MapReduce program, the keys emitted by the mappers are pairs of document ids. Thus, a new key-pair class (in our case a pair of `LongWritable`s) is needed.

    Several remarks and intuition here:

- `LongPair` needs to implement the `WritableComparable` interface in order to be shuffled and sorted.
- Override `equals`: the order within a pair should not matter when testing equality (for example, (A,B) should equal (B,A)), so the method also checks the inverted pair.
- Override `compareTo`: the exact ordering does not matter much, but it must be consistent with `equals`. We compare pairs by a sum value (the sum of the two ids) and a difference value (the absolute difference of the two ids). Two pairs are equal if and only if both values coincide: from a sum s and a difference d, the two ids can be recovered as (s+d)/2 and (s-d)/2, so (sum, diff) uniquely identifies an unordered pair.
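As a quick sanity check of that argument: (3,5) and (5,3) both give sum 8 and difference 2, so they compare as equal, while (2,6) also has sum 8 but difference 4, so it is strictly ordered before or after them. The full class: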
    // A WritableComparable key holding an unordered pair of document ids.
    class LongPair implements WritableComparable<LongPair> {

        private LongWritable first;
        private LongWritable second;

        public LongPair() {
            this.set(new LongWritable(0), new LongWritable(0));
        }

        public LongPair(LongWritable first, LongWritable second) {
            this.set(first, second);
        }

        public LongPair(Long first, Long second) {
            this.set(new LongWritable(first), new LongWritable(second));
        }

        public LongPair(String first, String second) {
            this.set(new LongWritable(Long.parseLong(first)),
                     new LongWritable(Long.parseLong(second)));
        }

        public LongWritable getFirst() {
            return first;
        }

        public LongWritable getSecond() {
            return second;
        }

        public void set(LongWritable first, LongWritable second) {
            this.first = first;
            this.second = second;
        }

        public void setFirst(LongWritable first) {
            this.first = first;
        }

        public void setFirst(Long first) {
            this.first = new LongWritable(first);
        }

        public void setSecond(LongWritable second) {
            this.second = second;
        }

        public void setSecond(Long second) {
            this.second = new LongWritable(second);
        }

        // Sum and absolute difference of the two ids; together they
        // identify an unordered pair uniquely.
        public long getSum() {
            return this.first.get() + this.second.get();
        }

        public long getDiff() {
            return Math.abs(this.first.get() - this.second.get());
        }

        public LongPair inverse() {
            return new LongPair(second, first);
        }

        // Order-insensitive equality: (A,B) equals (B,A).
        @Override
        public boolean equals(Object o) {
            if (o instanceof LongPair) {
                LongPair p1 = (LongPair) o;
                boolean b1 = first.equals(p1.first) && second.equals(p1.second);
                LongPair p2 = p1.inverse();
                boolean b2 = first.equals(p2.first) && second.equals(p2.second);
                return b1 || b2;
            }
            return false;
        }

        // hashCode must be order-insensitive as well, so that (A,B) and
        // (B,A) are routed to the same reducer by the default HashPartitioner.
        @Override
        public int hashCode() {
            return (int) (31 * getSum() + getDiff());
        }

        // Compare by (sum, diff); consistent with equals, since both values
        // match exactly when two unordered pairs are equal.
        @Override
        public int compareTo(LongPair other) {
            long cmp = this.getSum() - other.getSum();
            long cmpAlter = this.getDiff() - other.getDiff();
            if (cmp < 0) {
                return 1;
            } else if (cmp > 0) {
                return -1;
            } else if (cmpAlter < 0) {
                return 1;
            } else if (cmpAlter > 0) {
                return -1;
            }
            return 0;
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            first.readFields(in);
            second.readFields(in);
        }

        @Override
        public void write(DataOutput out) throws IOException {
            first.write(out);
            second.write(out);
        }

        @Override
        public String toString() {
            return first.toString() + "," + second.toString();
        }
    }
#### Similarity

To compute the similarity of two strings as instructed, we use a `Set` to store the words. The advantage of a set is that it automatically ignores duplicates, which makes the union and intersection operations straightforward.

    // Jaccard similarity of two comma-separated word lists.
    public double similarity(String t1, String t2) {

        Set<String> s1 = text2Set(t1);
        Set<String> s2 = text2Set(t2);

        Set<String> union = new HashSet<String>(s1);
        union.addAll(s2);

        Set<String> intersection = new HashSet<String>(s1);
        intersection.retainAll(s2);

        if (union.size() == 0) {
            return 0;
        }

        // Cast to double: plain integer division would always yield 0 or 1.
        return (double) intersection.size() / union.size();
    }
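The `text2Set` helper is not shown above; a minimal sketch, assuming the documents are stored as comma-separated word lists (as produced in the preprocessing step):

    // Split a comma-separated word list into a set of words.
    private Set<String> text2Set(String text) {
        return new HashSet<String>(Arrays.asList(text.split(",")));
    }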
#### File sampling

Our input file has more than 100000 documents. With pairwise comparison in the naive approach, more than 10^10 key-value pairs would be emitted (n = 124787 lines gives n(n-1) of roughly 1.56 x 10^10 emissions). This far exceeds the capacity of our virtual machine. In the following sections, we therefore only test the algorithms on sampled documents (the first 1000 documents). The files can be found here and here.

### 1. Naive approach

Instruction:

> Perform all pairwise comparisons between documents, using the following technique: each document is handled by a single mapper (remember that lines are used to represent documents in this assignment). The map method should emit, for each document, the document id along with one other document id as a key (one such pair for each other document in the corpus) and the document's content as a value. In the reduce phase, perform the Jaccard computations for all/some selected pairs. Output only similar pairs on HDFS, in TextOutputFormat.

To avoid redundant passes over the input file, some care is needed. In this algorithm, the input file is parsed only once.

- Load all document ids in advance (in our case we use the total line number n obtained from the previous counter, so the id set is simply 1..n; ids of empty documents are discarded later in the algorithm).
- For each (id, document) from the input, emit ((id, i), document) for every i != id in the id set (including ids of empty lines). Due to the symmetry of the key pair, most keys will receive two instances.
- In the reduce phase, process only keys with two instances. This way we ignore empty documents: they do not appear in the input file, so their keys receive only one instance. Since empty documents are rare, computation time is not much affected.
- The two instances of a key are exactly the two documents we need to compare. Compute the similarity and emit the key pairs that are similar.

The following code can be found here.

    Mapper:

    @Override
    public void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {

        // Skip empty documents.
        if (value.toString().isEmpty()) {
            return;
        }

        String keyOut = key.toString();

        // The input key is the line (document) id produced in preprocessing.
        if (!StringUtils.isNumeric(keyOut)) {
            System.out.println("WARN: bad input id");
            System.out.println(keyOut);
            return;
        }

        this.words = value;
        this.keyPair.setFirst(Long.parseLong(keyOut));

        // Emit (id, i) for every other document id i in 1..fileLength.
        long counter = 1;
        while (counter <= this.fileLength) {
            this.keyPair.setSecond(counter);

            // Skip the degenerate pair (id, id).
            if (this.keyPair.getDiff() == 0) {
                counter += 1;
                continue;
            }

            context.write(keyPair, words);
            counter += 1;
        }
    }

    Reducer:

    @Override
    public void reduce(LongPair key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Collect the (at most two) documents received for this pair.
        int counter = 0;
        String[] strings = new String[2];
        for (Text v : values) {
            strings[counter] = v.toString();
            counter += 1;
        }

        // Fewer than two instances: one id of the pair is not in the input
        // file (an empty line), so the pair is skipped.
        if (counter != 2) {
            return;
        }

        double s = similarity(strings[0], strings[1]);
        context.getCounter(CompCounter.NUM).increment(1);

        if (s >= 0.8) {
            context.write(new Text(key.toString()), new Text(String.valueOf(s)));
        }
    }

The number of mappers can be changed with this line:

          conf.set("mapreduce.input.fileinputformat.split.maxsize", String.valueOf(500));
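For context, this setting lives in the job driver. A minimal driver sketch; the class names `NaiveApproach`, `NaiveMapper`, and `NaiveReducer` are ours, and `KeyValueTextInputFormat` is an assumption suggested by the mapper's `(Text, Text)` input signature (it splits each line on the tab between line number and words):

    // Driver sketch: a small split size forces many mappers.
    Configuration conf = new Configuration();
    conf.set("mapreduce.input.fileinputformat.split.maxsize", String.valueOf(500));

    Job job = Job.getInstance(conf, "naive similarity join");
    job.setJarByClass(NaiveApproach.class);
    job.setInputFormatClass(KeyValueTextInputFormat.class);

    job.setMapperClass(NaiveMapper.class);
    job.setReducerClass(NaiveReducer.class);

    job.setMapOutputKeyClass(LongPair.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);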

The Hadoop job overview:

    Execution time: 7m 50s

365085 similarities are computed. Knowing that our sampled file contains n = 855 non-empty documents, we indeed find 365085 = n(n-1)/2, so the algorithm worked as expected.

### 2. Pre-filtering approach

Instruction:

> Create an inverted index, only for the first |d| - ⌈t|d|⌉ + 1 words of each document d (remember that they are stored in ascending order of frequency). In your reducer, compute the similarity of the document pairs. Output only similar pairs on HDFS, in TextOutputFormat. Report the execution time and the number of performed comparisons.

In this part, the implementation is more straightforward:

- In the map phase, invert id and words: for each word inside the window of the first |d| - ⌈t|d|⌉ + 1 words, output the word as key and the document id as value.
- Since the map phase outputs only ids and not the document contents, a hashmap for document retrieval is needed in the reduce phase. We load it in the setup function (a sketch follows below).
- In the reduce phase, for each key, compute similarities whenever several document ids are present. Since the words are sorted by ascending frequency, far fewer comparisons should be needed.
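A minimal sketch of that setup function, assuming the preprocessed `lineNumber TAB words` file path is passed through the `Configuration` (the property name `corpus.path` is ours):

    // Load the whole corpus into memory: document id -> word list.
    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        document = new HashMap<Text, Text>();
        Configuration conf = context.getConfiguration();
        FileSystem fs = FileSystem.get(conf);
        Path corpus = new Path(conf.get("corpus.path"));  // assumed property name
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(corpus)));
        String line;
        while ((line = br.readLine()) != null) {
            String[] kv = line.split("\t");
            if (kv.length == 2) {
                document.put(new Text(kv[0]), new Text(kv[1]));
            }
        }
        br.close();
    }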

The following code can be found here.

    Mapper:

    @Override
    public void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {

        // Skip empty documents.
        if (value.toString().isEmpty()) {
            return;
        }

        String[] document = value.toString().split(",");
        // Indexing window: the least frequent words, with t = 0.8.
        int window = document.length - (int) Math.floor(document.length * 0.8);
        int counter = 0;

        // Emit (word, documentId) for each word in the window.
        while (counter < window) {
            word.set(document[counter]);
            context.write(word, key);
            counter += 1;
        }
    }
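As a concrete check of the window size: for a document of |d| = 11 words and t = 0.8, the instruction gives 11 - ⌈8.8⌉ + 1 = 3 and the code gives 11 - ⌊8.8⌋ = 3, which agree; for |d| = 10, however, the instruction gives 10 - 8 + 1 = 3 while the code gives 10 - 8 = 2, so the code indexes one word fewer whenever t|d| is an integer.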

    Reducer:

    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Materialize the ids sharing this indexed word; the iterable can
        // only be traversed once, so the values are copied.
        List<Text> val = new ArrayList<Text>();
        for (Text v : values) {
            val.add(new Text(v));
        }
        // Compare every pair of documents sharing this word. Note that each
        // unordered pair is visited twice here (once in each order).
        for (Text v1 : val) {
            for (Text v2 : val) {
                if (v1.equals(v2)) {
                    continue;
                }

                // Retrieve the full documents from the in-memory corpus.
                String s1 = this.document.get(v1).toString();
                String s2 = this.document.get(v2).toString();

                context.getCounter(CompCounter.NUM).increment(1);
                Double s = similarity(s1, s2);

                if (s >= 0.8) {
                    context.write(new Text(v1.toString() + ',' + v2.toString()),
                                  new Text(String.valueOf(s)));
                }
            }
        }
    }

The Hadoop job overview:

    Execution time: 15 s

976 comparisons are made in this job, far fewer than in the naive approach.

### 3. Justification of the difference

The output of similar documents can be found here. Remember that we used a sampled file, so there are far fewer similar docs than there would be on the full input. Even so, we can see that similar docs are very rare compared to the length of the sampled input.

| Job                  | # of comparisons | Execution time |
|----------------------|------------------|----------------|
| NaiveApproach        | 365085           | 7m 50s         |
| PrefilteringApproach | 976              | 15s            |

The naive approach performs O(n^2) comparisons and emits O(n^2) key-value pairs, so it needs far more time and memory, even in the shuffle and sort phase.

The pre-filtering approach is very efficient when similar documents are rare and documents are not very long, which is exactly our case. This explains the drastic performance difference.