# BDPA_Assign2_WJIN.md

Assignment 2 for BDPA, by Wenyao JIN

## Preprocessing the input

### 1. Remake the stopwords file
By slightly modifying the wordcount code from the previous assignment, we can output a stopwords file.

- take all three input files as before
- use space or `--` as the tokenizer
- filter out all characters besides letters and numbers
- transform all words to lower case
- output only the words whose count is larger than 4000
```java
public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    // split on whitespace or on runs of two or more dashes
    for (String token : value.toString().split("\\s+|-{2,}+")) {
        // keep only letters and digits, then lower-case the word
        word.set(token.replaceAll("[^A-Za-z0-9]+", "").toLowerCase());
        context.write(word, ONE);
    }
}
```
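
The frequency threshold is applied on the reduce side. Below is a minimal reducer sketch, assuming the standard `Text`/`IntWritable` word count types; the `THRESHOLD` constant is my own name for the 4000-occurrence cutoff mentioned above:

```java
// uses org.apache.hadoop.io.{Text, IntWritable} and org.apache.hadoop.mapreduce.Reducer
public static class StopWordReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // cutoff from the assignment: keep only words appearing more than 4000 times
    private static final int THRESHOLD = 4000;

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        // the frequent words form the stopword list
        if (sum > THRESHOLD) {
            context.write(key, new IntWritable(sum));
        }
    }
}
```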
The stop word file can be found here.
### 2. Count word frequency of pg100.txt
Using the wordcount algorithm again, we recount the word frequency of pg100.txt, to be used later for word sorting. This time capitalization is kept, so that it is taken into account in the similarity comparison. The output file can be found here.
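
Concretely, the only change to the mapper is dropping the `toLowerCase()` call (a sketch; the summing reducer from the word count is unchanged):

```java
public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    for (String token : value.toString().split("\\s+|-{2,}+")) {
        // same cleaning as before, but the original case is preserved
        word.set(token.replaceAll("[^A-Za-z0-9]+", ""));
        context.write(word, ONE);
    }
}
```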
### 3. Output lines
In this step, several tasks should be done:

- Store all stopwords in a set
- Store all word frequencies in a hashmap
- For each line:
  - keep counting line numbers with a counter
  - skip empty lines
  - split the line into words
  - filter out special characters
  - remove words that are stopwords
  - remove duplicates
  - sort the remaining words by their pre-calculated frequency
  - output the words with their line number as key
For this step, all tasks are done within the mapper. The tokenizer is space or `--`, as before. A set container is used to avoid duplicates. Java's built-in sort function is applied with a custom comparator that incorporates the word frequency. StringUtils' `join` function serves to join the words together with commas. The counter reveals a total of 124787 lines.
```java
public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    // count every input line; the counter value serves as the line id
    Counter counter = context.getCounter(DocLineCounter.NUM);
    counter.increment(1);

    // skip empty lines
    if (value.toString().isEmpty()) {
        return;
    }

    // collect the words of the line; a set removes duplicates
    Set<String> wordSet = new HashSet<String>();
    for (String token : value.toString().split("\\s+|-{2,}+")) {
        String s = token.replaceAll("[^A-Za-z0-9]+", "");
        if (stopWords.contains(s) || s.isEmpty()) {
            continue;
        } else if (!wordFreq.containsKey(s)) {
            System.out.println("WARN: HASHTABLE DOES NOT HAVE WORD: " + s);
            // skip it, otherwise the comparator below would fail on a missing frequency
            continue;
        }
        wordSet.add(s);
    }

    // sort the words by their pre-computed global frequency
    List<String> wordList = new ArrayList<String>(wordSet);
    Collections.sort(wordList, new Comparator<String>() {
        @Override
        public int compare(String s1, String s2) {
            return wordFreq.get(s1).compareTo(wordFreq.get(s2));
        }
    });

    words.set(StringUtils.join(wordList, ","));
    context.write(new LongWritable(counter.getValue()), words);
}
```
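
The `stopWords` set and `wordFreq` map used above are filled once per mapper in `setup()`. A minimal sketch, assuming the stopword and frequency files are available as local tab-separated files named `stopwords.txt` and `wordfreq.txt` (both file names are placeholders):

```java
// uses java.io.BufferedReader and java.io.FileReader
private Set<String> stopWords = new HashSet<String>();
private Map<String, Integer> wordFreq = new HashMap<String, Integer>();

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    // stopwords.txt (hypothetical name): one "word<TAB>count" pair per line
    BufferedReader stopReader = new BufferedReader(new FileReader("stopwords.txt"));
    String line;
    while ((line = stopReader.readLine()) != null) {
        stopWords.add(line.split("\t")[0]);
    }
    stopReader.close();

    // wordfreq.txt (hypothetical name): one "word<TAB>count" pair per line
    BufferedReader freqReader = new BufferedReader(new FileReader("wordfreq.txt"));
    while ((line = freqReader.readLine()) != null) {
        String[] parts = line.split("\t");
        wordFreq.put(parts[0], Integer.parseInt(parts[1]));
    }
    freqReader.close();
}
```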
The output file can be found here. The total line number is output to HDFS as instructed; that file can be found here as well.
## Set similarity joins

### 0. Primary implementation
In this part, we need to compare pairwise similarity between lines. Before implementing the two approaches, several basic modules need to be built.
#### Key pair
In this MapReduce program, the keys emitted from the mappers will be pairs of line IDs. Thus, a new key-pair class (in our case a pair of `LongWritable`s) is needed.
Several remarks and intuitions here:

- `LongPair` needs to implement the `WritableComparable` interface in order to be shuffled and sorted.
- Override `equals`: the order within a pair should not matter when checking whether two pairs are equal (for example, `(A,B)` should equal `(B,A)`). So the function also checks against the inverse of the other pair before returning false.
- Override `compareTo`: the actual ordering matters little, but it must be coherent with `equals`. Here I compare pairs by a *sum* value (the sum of the two IDs) and a *difference* value (the absolute difference of the two IDs). Two pairs are equal if and only if both values coincide, since a sum s and a difference d determine the unordered pair uniquely as {(s+d)/2, (s-d)/2}.
```java
class LongPair implements WritableComparable<LongPair> {
    private LongWritable first;
    private LongWritable second;

    public LongPair() {
        this.set(new LongWritable(0), new LongWritable(0));
    }

    public LongPair(LongWritable first, LongWritable second) {
        this.set(first, second);
    }

    public LongPair(Long first, Long second) {
        this.set(new LongWritable(first), new LongWritable(second));
    }

    public LongPair(String first, String second) {
        this.set(new LongWritable(Long.parseLong(first)), new LongWritable(Long.parseLong(second)));
    }

    public LongWritable getFirst() {
        return first;
    }

    public LongWritable getSecond() {
        return second;
    }

    public void set(LongWritable first, LongWritable second) {
        this.first = first;
        this.second = second;
    }

    public void setFirst(LongWritable first) {
        this.first = first;
    }

    public void setFirst(Long first) {
        this.first = new LongWritable(first);
    }

    public void setSecond(LongWritable second) {
        this.second = second;
    }

    public void setSecond(Long second) {
        this.second = new LongWritable(second);
    }

    public long getSum() {
        return this.first.get() + this.second.get();
    }

    public long getDiff() {
        return Math.abs(this.first.get() - this.second.get());
    }

    public LongPair inverse() {
        return new LongPair(second, first);
    }

    // two pairs are equal regardless of the order of their elements
    @Override
    public boolean equals(Object o) {
        if (o instanceof LongPair) {
            LongPair p1 = (LongPair) o;
            boolean b1 = first.equals(p1.first) && second.equals(p1.second);
            LongPair p2 = p1.inverse();
            boolean b2 = first.equals(p2.first) && second.equals(p2.second);
            return b1 || b2;
        }
        return false;
    }

    // hashCode must also be order-insensitive to stay consistent with equals
    @Override
    public int hashCode() {
        return (int) (31 * getSum() + getDiff());
    }

    // order by sum, then by absolute difference; coherent with equals
    @Override
    public int compareTo(LongPair other) {
        long cmp = this.getSum() - other.getSum();
        long cmp_alter = this.getDiff() - other.getDiff();
        if (cmp < 0) {
            return 1;
        } else if (cmp > 0) {
            return -1;
        } else if (cmp_alter < 0) {
            return 1;
        } else if (cmp_alter > 0) {
            return -1;
        }
        return 0;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        first.readFields(in);
        second.readFields(in);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        first.write(out);
        second.write(out);
    }

    @Override
    public String toString() {
        return first.toString() + "," + second.toString();
    }
}
```
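
A quick standalone sanity check of the symmetry property (an illustrative sketch, not part of the job; the IDs 3 and 7 are arbitrary examples):

```java
LongPair p = new LongPair(3L, 7L);
LongPair q = new LongPair(7L, 3L);

System.out.println(p.equals(q));                   // true: order inside the pair is ignored
System.out.println(p.compareTo(q) == 0);           // true: compareTo is coherent with equals
System.out.println(p.hashCode() == q.hashCode());  // true: same sum and same difference
```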
#### Similarity
To compute the similarity of two strings as instructed, we use a `Set` to store the words. The advantage of a set is that it automatically ignores duplicates, which enables quick computation of the union and intersection.
```java
public double similarity(String t1, String t2) {
    Set<String> s1 = text2Set(t1);
    Set<String> s2 = text2Set(t2);

    Set<String> union = new HashSet<String>(s1);
    union.addAll(s2);

    Set<String> intersection = new HashSet<String>(s1);
    intersection.retainAll(s2);

    if (union.size() == 0) {
        return 0;
    }
    // cast to double to avoid integer division
    return (double) intersection.size() / union.size();
}
```
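
The helper `text2Set` is not shown above; a plausible implementation, assuming the words arrive comma-separated as produced by the preprocessing step:

```java
private Set<String> text2Set(String text) {
    // split the comma-joined word list back into a set of distinct words
    return new HashSet<String>(Arrays.asList(text.split(",")));
}
```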