
Counting the Words in an Article and Their Frequencies with Hadoop


Hadoop ships with an example program for counting words, hadoop-examples-*.jar. It can be run with hadoop jar hadoop-examples-*.jar <input dir> <output dir>. However, it splits words only on whitespace, so tokens like "hello,how" show up in the output. Below, the program is modified to split on spaces, commas, line breaks, and other punctuation, and to report only words that occur more than 3 times.
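To see the problem concretely, here is a small standalone sketch (the sample sentence is made up) showing that the default StringTokenizer keeps "hello,how" glued together:

[java]
import java.util.StringTokenizer;

public class DefaultSplitDemo {
    public static void main(String[] args) {
        // The default StringTokenizer splits on whitespace only,
        // so punctuation stays attached to the words.
        StringTokenizer itr = new StringTokenizer("hello,how are you");
        while (itr.hasMoreTokens()) {
            System.out.println(itr.nextToken());
        }
        // prints:
        // hello,how
        // are
        // you
    }
}
[/java]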

The original program:

[java]
package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  public static class IntSumReducer
       extends Reducer<Text,IntWritable,Text,IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: wordcount <in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
[/java]

By default, Hadoop feeds the mapper one line of input at a time. IntWritable, LongWritable, and Text are classes provided by the MapReduce framework; they give Hadoop its serialization support for keys and values.
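As a quick illustration of what "serialization support" means here, a minimal sketch (not part of the job) that round-trips an IntWritable through a byte stream:

[java]
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import org.apache.hadoop.io.IntWritable;

public class WritableDemo {
    public static void main(String[] args) throws Exception {
        // Writable types know how to write themselves to a compact binary form,
        // which is what lets Hadoop ship keys/values between map and reduce.
        IntWritable out = new IntWritable(42);
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        out.write(new DataOutputStream(bytes));   // serialize

        IntWritable in = new IntWritable();
        in.readFields(new DataInputStream(
                new ByteArrayInputStream(bytes.toByteArray())));  // deserialize
        System.out.println(in.get());             // prints: 42
    }
}
[/java]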
StringTokenizer itr = new StringTokenizer(value.toString());
By default StringTokenizer splits on whitespace; one of its other constructors lets us supply our own delimiter set:
StringTokenizer itr = new StringTokenizer(value.toString(), " \t\n\r\f,.。,:;?![]'");
To make the count case-insensitive:
word.set(itr.nextToken().toLowerCase());
To report only words that occur more than 3 times:
if (sum > 3) context.write(key, result);
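Putting the tokenizer changes together in a quick standalone check (the sample sentence is made up; this only verifies the delimiter set and lowercasing, it is not part of the job):

[java]
import java.util.StringTokenizer;

public class CustomSplitDemo {
    public static void main(String[] args) {
        // Same delimiter set as the modified mapper below, plus lowercasing.
        StringTokenizer itr =
                new StringTokenizer("Hello,how are you. Fine; thanks.", " \t\n\r\f,.:;[]'");
        while (itr.hasMoreTokens()) {
            System.out.println(itr.nextToken().toLowerCase());
        }
        // prints (one per line): hello how are you fine thanks
    }
}
[/java]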
Create three directories, wcount, wcount/src, and wcount/classes, and put WordCount.java into wcount/src.
The final version:

[java]

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      // Split on whitespace plus common punctuation instead of whitespace only.
      StringTokenizer itr = new StringTokenizer(value.toString(), " \t\n\r\f,.:;[]'");
      while (itr.hasMoreTokens()) {
        // Lowercase so "The" and "the" are counted as the same word.
        word.set(itr.nextToken().toLowerCase());
        context.write(word, one);
      }
    }
  }

  public static class IntSumReducer
       extends Reducer<Text,IntWritable,Text,IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      // Only emit words that occur more than 3 times.
      if (sum > 3) context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: wordcount <in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

[/java]
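One caveat worth noting: the job still uses IntSumReducer as the combiner, so the sum > 3 filter also runs on each mapper's partial sums. With a single small input file (one split) that happens to be harmless, but with several splits a word whose partial count on one mapper is 3 or less would be dropped too early and undercounted. A safer variant (my own sketch, not from the original post) is to keep the original, unfiltered IntSumReducer as the combiner and move the filter into a separate final reducer:

[java]
// Hypothetical FilteringSumReducer: same summing logic as the original IntSumReducer,
// but only emits words whose total count is greater than 3.
public static class FilteringSumReducer
     extends Reducer<Text, IntWritable, Text, IntWritable> {
  private IntWritable result = new IntWritable();

  public void reduce(Text key, Iterable<IntWritable> values, Context context
                     ) throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable val : values) {
      sum += val.get();
    }
    if (sum > 3) {
      result.set(sum);
      context.write(key, result);
    }
  }
}

// In main(), wire it up so the combiner only produces partial sums:
// job.setCombinerClass(IntSumReducer.class);        // plain sum, no filtering
// job.setReducerClass(FilteringSumReducer.class);   // final sum plus the ">3" filter
[/java]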

Compile with javac -classpath hadoop-core-*.jar -d classes src/WordCount.java. (Note: I could not find hadoop-core-*.jar in Hadoop 2.2, only the source; if you don't feel like building the jar yourself, grab it from a 1.x release.)
Normally the compile fails because one class cannot be found: commons-cli. At first I assumed it was a version mismatch and fought with it for a long time without success, so in the end I just downloaded the jar directly. o(╯□╰)o

wget https://repository.apache.org/content/repositories/snapshots/commons-cli/commons-cli/1.3-SNAPSHOT/commons-cli-1.3-20140221.042039-102.jar
The file name is a mouthful, so rename it (I used cli.jar) and compile again:

javac -classpath cli.jar:hadoop-core-*.jar -d classes src/WordCount.java
Then package the classes into a jar:
jar -cvf wc.jar -C classes/ .
If nothing goes wrong, wc.jar now sits in the current directory.
Next, prepare some text to count. Let's start with English: the US State of the Union address (http://language.chinadaily.com.cn/article-186522-1.html). Save its content to unionAddress.txt.
Create a directory named ua in HDFS:
hadoop fs -mkdir /ua
Put the file into /ua:
hadoop fs -put unionAddress.txt /ua
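The same two steps can also be done from Java through the HDFS FileSystem API, if you prefer; a minimal sketch assuming the same paths as the commands above:

[java]
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class UploadToHdfs {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();   // picks up core-site.xml etc.
        FileSystem fs = FileSystem.get(conf);       // handle to the default filesystem (HDFS)
        fs.mkdirs(new Path("/ua"));                 // same as: hadoop fs -mkdir /ua
        // same as: hadoop fs -put unionAddress.txt /ua
        fs.copyFromLocalFile(new Path("unionAddress.txt"), new Path("/ua"));
        fs.close();
    }
}
[/java]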
Run the job:
hadoop jar wc.jar org.apache.hadoop.examples.WordCount /ua /uaout
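Once the job finishes, the counts can be read back with hadoop fs -cat, or from Java; a minimal sketch (the /uaout/part-r-00000 file name assumes the default single reducer):

[java]
import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PrintWordCounts {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // With a single reducer the counts land in one output file.
        Path output = new Path("/uaout/part-r-00000");
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(fs.open(output)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);   // each line is "<word>\t<count>"
            }
        }
    }
}
[/java]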
(Screenshots omitted: the job's console output, part of the execution results, and the resulting word counts.)

Other notes:
1. While uploading files I hit org.apache.hadoop.dfs.SafeModeException: Cannot delete /user/hadoop/input. Name node is in safe mode. This happened because I had earlier killed Hadoop with Ctrl+C, which left the NameNode in safe mode; hadoop dfsadmin -safemode leave takes it out of safe mode.
2. Having run a different Hadoop version before, I got "Incompatible namespaceIDs in /tmp/hadoop-root/dfs/data". The namespaceIDs under /tmp/hadoop-root/dfs/data were incompatible, i.e. the previous version had left stale data in that directory. After clearing it out, everything ran normally.
3. The NameNode then refused to start because I had recklessly wiped /tmp entirely; it turns out /tmp still held some useful information (the NameNode's format metadata?). Reformatting fixed it: hadoop namenode -format
