快速入门MapReduce① 实现WordCount-白红宇

快速入门MapReduce① 实现WordCount

阅读量：3951 次

发布时间：2019-05-24

本文共 4954 字，大约阅读时间需要 16 分钟。

1.需要处理的数据

hello word
word count
hello MapReduce

2.创建maven项目pom.xml


           
                
     
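<repositories>
    <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
</repositories>

<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.6.0-mr1-cdh5.14.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.6.0-cdh5.14.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.6.0-cdh5.14.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.6.0-cdh5.14.0</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.testng</groupId>
        <artifactId>testng</artifactId>
        <version>RELEASE</version>
    </dependency>
</dependencies>

<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.0</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <encoding>UTF-8</encoding>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.4.3</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <minimizeJar>true</minimizeJar>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>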

3.编写map类

package com.czxy.wordCount;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Map stage of the word-count job.
 *
 * <p>Receives one input record at a time (key and value types are fixed by the
 * type arguments below), splits the value on whitespace and writes a
 * {@code (word, 1)} pair for every non-empty token.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    /** The count written for each single word occurrence. */
    private static final LongWritable ONE = new LongWritable(1);

    /** Output key instance, refilled for every token instead of allocating a new one. */
    private final Text word = new Text();

    /**
     * Splits one record into words and emits {@code (word, 1)} for each of them.
     *
     * @param key     the input key of the record; not used by this mapper
     * @param value   the text of the record to tokenize
     * @param context the task context the output pairs are written to
     * @throws IOException          if writing an output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Convert the Hadoop Text value into a plain String.
        String line = value.toString();

        // Split on runs of whitespace so that repeated spaces or tabs do not
        // produce empty "words".
        for (String token : line.split("\\s+")) {
            // A blank line or leading whitespace still yields one empty token.
            if (token.isEmpty()) {
                continue;
            }
            // Emit key = word, value = 1.
            word.set(token);
            context.write(word, ONE);
        }
    }
}

4.编写Reduce类

package com.czxy.wordCount;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reduce stage of the word-count job.
 *
 * <p>For each word, adds up all the counts received for it and writes a single
 * {@code (word, total)} pair.
 */
public class WordCountReduce extends Reducer<Text, LongWritable, Text, LongWritable> {

    /**
     * Sums the counts of one word and writes the total.
     *
     * @param key     the word being aggregated
     * @param values  the individual counts received for {@code key}
     * @param context the task context the result pair is written to
     * @throws IOException          if writing the result fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        // Accumulate in a long: the values are long, and an int accumulator
        // would be silently narrowed by "+=" and could overflow.
        long sumCount = 0L;
        for (LongWritable value : values) {
            sumCount += value.get();
        }

        // Emit the word together with its total number of occurrences.
        context.write(key, new LongWritable(sumCount));
    }
}

5.编写启动类

package com.czxy.wordCount;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;public class WordCountDriver extends Configured implements Tool {    @Override    public int run(String[] args) throws Exception {        // 获取job        Job job = Job.getInstance(new Configuration());        //  设置支持jar执行        job.setJarByClass(WordCountDriver.class);        // 设置执行的napper        job.setMapperClass(WordCountMapper.class);        // 设置map输出的key类型        job.setMapOutputKeyClass(Text.class);        // 设置map输出value类型        job.setMapOutputValueClass(LongWritable.class);        // 设置执行的reduce        job.setReducerClass(WordCountReduce.class);        // 设置reduce输出key的类型        job.setOutputKeyClass(Text.class);        // 设置reduce输出value的类型        job.setOutputValueClass(LongWritable.class);        // 设置文件输入        job.setInputFormatClass(TextInputFormat.class);        TextInputFormat.addInputPath(job, new Path("./data/wordCount/"));        // 设置文件输出        job.setOutputFormatClass(TextOutputFormat.class);        TextOutputFormat.setOutputPath(job, new Path("./outPut/wordCount/"));        // 设置启动类        boolean b = job.waitForCompletion(true);        return b ? 0 : 1;    }    public static void main(String[] args) throws Exception {        // 调用启动方法        ToolRunner.run(new WordCountDriver(), args);    }}