TFIDF code

TF-IDF is computed here as a chain of three MapReduce jobs.
TF and DF can each be produced by a simple MapReduce job; posted below is the third job, which joins them to compute the final TF-IDF scores.


Custom Key

/**
 * Composite key for the TF-IDF join job: a word plus a record-type flag
 * (0 = DF record, 1 = TF record). The sort order is word first, then type,
 * so the single DF record for a word always arrives at the reducer before
 * that word's TF records.
 */
public class Key implements WritableComparable<Key>{

Text word;
IntWritable type;

public Key() {
word = new Text();
type = new IntWritable();
}

public Text getWord() {
return word;
}

public void setWord(Text word) {
this.word = word;
}

public IntWritable getType() {
return type;
}

public void setType(IntWritable type) {
this.type = type;
}

@Override
public void readFields(DataInput arg0) throws IOException {
// Must deserialize fields in exactly the order write() serialized them.
this.word.readFields(arg0);
this.type.readFields(arg0);
}

@Override
public void write(DataOutput arg0) throws IOException {
this.word.write(arg0);
this.type.write(arg0);
}

@Override
public int compareTo(Key o) {
// Primary order: word. Secondary order: type, so DF (0) sorts before TF (1).
int cmp = this.word.compareTo(o.getWord());
if(cmp == 0){
cmp = this.type.compareTo(o.getType());
}
return cmp;
}

/**
 * Hash on the word ONLY, deliberately ignoring type: the default
 * HashPartitioner uses key.hashCode(), and all records for a word (its DF
 * record and its TF records) must land on the same reducer for the grouping
 * comparator to join them. The previous identity-based Object.hashCode()
 * scattered equal words across reducers.
 */
@Override
public int hashCode() {
return word.hashCode();
}

@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if (!(obj instanceof Key)) {
return false;
}
Key other = (Key) obj;
// Consistent with compareTo: equal iff both word and type match.
return word.equals(other.word) && type.equals(other.type);
}

}

Custom Value

public class Value implements Writable{
 Text filename;
 IntWritable tf;
 IntWritable df;
 public Value() {
// TODO Auto-generated constructor stub
filename=new Text();
tf=new IntWritable();
df=new IntWritable();
 }
@Override
public void readFields(DataInput in) throws IOException {
// TODO Auto-generated method stub
this.filename.readFields(in);
this.tf.readFields(in);
this.df.readFields(in);
}

@Override
public void write(DataOutput out) throws IOException {
// TODO Auto-generated method stub
this.filename.write(out);
this.tf.write(out);
this.df.write(out);
}
public Text getFilename() {
return filename;
}
public void setFilename(Text filename) {
this.filename = filename;
}
public IntWritable getTf() {
return tf;
}
public void setTf(IntWritable tf) {
this.tf = tf;
}
public IntWritable getDf() {
return df;
}
public void setDf(IntWritable df) {
this.df = df;
}

}

Mapper

public class Mappe extends Mapper<LongWritable, Text, Key, Value>{
int type = 0;
Key k = new Key();
Value v = new Value();
@Override
protected void setup(Context context)throws IOException, InterruptedException {
Path fileName = ((FileSplit)context.getInputSplit()).getPath();
String file = fileName.getName();
if(file.equalsIgnoreCase("TF")){
type = 1;
}else{
type = 0;
}
}
@Override
protected void map(LongWritable key, Text value,Context context)throws IOException, InterruptedException {
  String[] splits = value.toString().split("\\W+");
  if(type == 1){
     k.setWord(new Text(splits[0]));
k.setType(new IntWritable(type));
 
     v.setFilename(new Text(splits[1]));
     v.setTf(new IntWritable(Integer.parseInt(splits[2])));
}else{
     k.setWord(new Text(splits[0]));
k.setType(new IntWritable(type));
 
     v.setDf(new IntWritable(Integer.parseInt(splits[1])));
}
context.write(k, v);
}
}

Grouping Comparator 

/**
 * Grouping comparator: groups reducer input by word only, ignoring the type
 * flag, so a word's DF record (type 0) and all of its TF records (type 1)
 * arrive together in a single reduce() call.
 */
public class GrpCMP extends WritableComparator{

protected GrpCMP() {
// true -> allocate Key instances for the framework to deserialize into.
super(Key.class, true);
}

@Override
public int compare(WritableComparable a, WritableComparable b) {
Key left = (Key) a;
Key right = (Key) b;
// Compare on the word alone; the type flag is intentionally ignored here.
return left.getWord().compareTo(right.getWord());
}
}

Reducer

/**
 * Reducer for the TF-IDF join job. Thanks to the key sort order (type 0
 * before type 1) and the grouping comparator (group by word), the first value
 * for each word is its DF record and the rest are TF records. Emits
 * "word : file" -> tf * log(N / df).
 */
public class Reduce extends Reducer<Key, Value, Text, Text> {

// Total number of documents in the corpus (N in the IDF formula).
// Defaults to the original hard-coded 5; override with -D total.documents=N.
private int totalDocuments = 5;

@Override
protected void setup(Context context) throws IOException, InterruptedException {
totalDocuments = context.getConfiguration().getInt("total.documents", 5);
}

@Override
protected void reduce(Key k, Iterable<Value> v,Context context)throws IOException, InterruptedException {
Iterator<Value> itr = v.iterator();
if (!itr.hasNext()) {
return;
}
// First value is the DF record (type 0 sorts ahead of all TF records).
Value val = itr.next();
int df = val.getDf().get();
if (df <= 0) {
// No/invalid document frequency -> IDF undefined; skip this word.
return;
}
// BUG FIX: the original computed (double)(5/Df) — integer division ran
// before the cast, so any df > 5 gave 0 and Math.log(0) = -Infinity.
double idfRatio = (double) totalDocuments / df;
while(itr.hasNext()){
Value val1=itr.next();
String file=val1.getFilename().toString();
int tf=val1.getTf().get();
double tfidf = Math.log(idfRatio) * tf;
context.write(new Text(k.getWord()+" \t\t:\t "+file),new Text("\t\t"+Double.toString(tfidf)));
}
}
}

Comments

  1. I feel really happy to have seen your webpage and look forward to so many more entertaining
    times reading here. Thanks once more for all the details.
    Big Data training in Chennai

    ReplyDelete

Post a Comment

Popular posts from this blog

Hadoop 1 Vs Hadoop 2

Secondary NameNode check-pointing process

Failover and fencing