Hadoop Example Program: WordCount Explained


A Hadoop MapReduce program consists of three parts: two classes (one extending the Mapper class, one extending the Reducer class) and a main function.


The WordCount example counts how often each individual word appears in a body of text. Given two text files as input, it outputs each word paired with its number of occurrences.
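For instance, suppose the two input files contain the lines "Hello World Bye World" and "Hello Hadoop Goodbye Hadoop" (hypothetical sample data for illustration). The job would output:

Bye	1
Goodbye	1
Hadoop	2
Hello	2
World	2

Each line pairs a word with its total count across all input files.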


The code, with detailed comments, follows:

package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

  public static class TokenizerMapper 
       extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);  // constant IntWritable "1", the value emitted for every word
    private Text word = new Text();                             // reusable Text object that receives each incoming word

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());  // split the input line into whitespace-delimited words
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());                                  // assign the next word to word
        context.write(word, one);                                   // emit the key-value pair (word, 1)
      }
    }
  }
  
  public static class IntSumReducer 
       extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();  // reusable IntWritable that holds the final count

    public void reduce(Text key, Iterable<IntWritable> values, Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;                     // running total for this key, initialized to 0
      for (IntWritable val : values) {
        sum += val.get();              // the framework has grouped all values emitted for this key; add them up
      }
      result.set(sum);                 // store the total in result
      context.write(key, result);      // emit the key-value pair (word, total count)
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: wordcount <in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "word count");                       // create a new job
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);                   // set the map class
    job.setCombinerClass(IntSumReducer.class);                   // set the combiner class
    job.setReducerClass(IntSumReducer.class);                    // set the reducer class
    job.setOutputKeyClass(Text.class);                           // type of the output key
    job.setOutputValueClass(IntWritable.class);                  // type of the output value
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));   // input and output paths, taken from the command line
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

WordCount is to Hadoop what HelloWorld is to Java: the introductory example that gives beginners a simple feel for how Hadoop works. There is still a long road ahead.