Hive编码实战

通常都会使用Word Count作为用户学习使用Java编写MapReduce程序的例子，因为这样用户可以专注于API本身。因此，Word Count已经成为Hadoop世界中的“Hello World”程序了。

Apache Hadoop 发行版中包含下面这个Java实现：

import java.io.IOException;
import java.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCount {

public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
　　private final static IntWritable one = new IntWritable(1);
　　private Text word = new Text();

　　public void map(LongWritable key, Text value, Context context)
　　　throws IOException, InterruptedException {
　　　　String line = value.toString();
　　　　StringTokenizer tokenizer = new StringTokenizer(line);
　　　　while (tokenizer.hasMoreTokens()) {
　　　　　　word.set(tokenizer.nextToken());
　　　　　　context.write(word, one);
　　　　　}
　　　}
}

　　public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
　　　　public void reduce(Text key, Iterable<IntWritable> values, Context context)
　　　　　throws IOException, InterruptedException {
　　　　　　int sum = 0;
　　　　　　for (IntWritable val : values) {
　　　　　　　　sum += val.get();
　　　　　　　}
　　　　　　　context.write(key, new IntWritable(sum));
　　　　}
}

public static void main(String[] args) throws Exception {
　　Configuration conf = new Configuration();

　　Job job = new Job(conf, "wordcount");

　　job.setOutputKeyClass(Text.class);
　　job.setOutputValueClass(IntWritable.class);

　　job.setMapperClass(Map.class);
　　job.setReducerClass(Reduce.class);

　　job.setInputFormatClass(TextInputFormat.class);
　　job.setOutputFormatClass(TextOutputFormat.class);

　　FileInputFormat.addInputPath(job, new Path(args[0]));
　　FileOutputFormat.setOutputPath(job, new Path(args[1]));

　　job.waitForCompletion(true);
　}
}

下面使用HiveQL完成相同的计算，既不需要编译，也不需要生成“JAR”（Java归档）文件：

-- One row per line of input text.
CREATE TABLE docs (line STRING);

-- Load the files at path 'docs' into the table, overwriting its current contents.
LOAD DATA INPATH 'docs' OVERWRITE INTO TABLE docs;

-- Split each line into words, then count the occurrences of every word.
-- The backslash must be doubled: inside a Hive string literal '\s' is
-- unescaped to the plain letter 's', which would split on "s" instead of
-- whitespace. '\\s+' also treats a run of whitespace as a single separator.
CREATE TABLE word_counts AS
SELECT word, count(1) AS count FROM
  (SELECT explode(split(line, '\\s+')) AS word FROM docs) w
GROUP BY word
ORDER BY word;

yangjava

Hive编码实战

Hive编码实战

Search

微信好友

Table of Contents