有如下数据
a.txt
hello tom
hello jim
hello kitty
hello rose
b.txt
hello jerry
hello jim
hello kitty
hello jack
c.txt
hello jerry
hello java
hello c++
hello c++
需要输出如下格式:
c++ c.txt-->2
hello a.txt-->4 b.txt-->4 c.txt-->4
jack b.txt-->1
java c.txt-->1
jerry b.txt-->1 c.txt-->1
jim a.txt-->1 b.txt-->1
kitty a.txt-->1 b.txt-->1
rose a.txt-->1
tom a.txt-->1
思路
1、先写一个mr程序:统计出每个单词在每个文件中的总次数
hello-a.txt 4
hello-b.txt 4
hello-c.txt 4
java-c.txt 1
jerry-b.txt 1
jerry-c.txt 1
要点1:map方法中,如何获取所处理的这一行数据所在的文件名?
worker在调map方法时,会传入一个context,而context中包含了这个worker所读取的数据切片信息,而切片信息又包含这个切片所在的文件信息
那么,就可以在map中:
FileSplit split = (FileSplit) context.getInputSplit();
String fileName = split.getPath().getName();
2、然后在写一个mr程序,读取上述结果数据:
map: 按"-"切分,以单词做key,后面一段作为value
reduce: 拼接values里面的每一段,以单词做key,拼接结果做value,输出即可
package com.bigdata.mapreduce.index;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Step one of building an inverted index: for every word, count its
 * occurrences per source file, emitting keys of the form "word-filename"
 * with the total count as the value (e.g. "hello-a.txt  4").
 */
public class IndexStepOne {

    public static class IndexStepOneMapper
            extends Mapper<LongWritable, Text, Text, IntWritable> {

        // Reused output objects: Hadoop serializes on write(), so reusing
        // writables avoids one allocation per record.
        private static final IntWritable ONE = new IntWritable(1);
        private final Text outKey = new Text();

        /**
         * Emits ("word-filename", 1) for each space-separated word on the line.
         * The source file name is recovered from the input split carried by
         * the context.
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // The input split describes the slice being read, which in turn
            // knows the file the current line belongs to.
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String fileName = inputSplit.getPath().getName();
            for (String word : value.toString().split(" ")) {
                // Key "word-filename", value 1.
                outKey.set(word + "-" + fileName);
                context.write(outKey, ONE);
            }
        }
    }

    /** Sums the per-occurrence 1s for each "word-filename" key. */
    public static class IndexStepOneReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        private final IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            result.set(count);
            context.write(key, result);
        }
    }

    /** Configures and runs the step-one job against the local input directory. */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(IndexStepOne.class);
        job.setMapperClass(IndexStepOneMapper.class);
        job.setReducerClass(IndexStepOneReducer.class);
        job.setNumReduceTasks(1);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\mrdata\\index\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\mrdata\\index\\output"));
        job.waitForCompletion(true);
    }
}
输出结果如下:
c++-c.txt 2
hello-a.txt 4
hello-b.txt 4
hello-c.txt 4
jack-b.txt 1
java-c.txt 1
jerry-b.txt 1
jerry-c.txt 1
jim-a.txt 1
jim-b.txt 1
kitty-a.txt 1
kitty-b.txt 1
rose-a.txt 1
tom-a.txt 1
第二个mr程序根据第一个mr程序计算出的数据进行切割、拼接
package com.bigdata.mapreduce.index;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Step two of building an inverted index: reads step-one output lines of the
 * form "word-filename\tcount" and produces one line per word listing every
 * file and count, e.g. "hello  a.txt-->4  b.txt-->4  c.txt-->4".
 */
public class IndexStepTwo {

    public static class IndexStepTwoMapper
            extends Mapper<LongWritable, Text, Text, Text> {

        // Reused output writables (Hadoop serializes on write()).
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        /**
         * Re-keys each line by the word; the remainder "filename\tcount"
         * becomes the value "filename-->count".
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Limit 2: split only at the FIRST '-' so a filename that itself
            // contains '-' is not broken into extra pieces.
            String[] split = value.toString().split("-", 2);
            outKey.set(split[0]);
            outValue.set(split[1].replaceAll("\t", "-->"));
            context.write(outKey, outValue);
        }
    }

    /** Concatenates all "filename-->count" fragments for a word, tab-separated. */
    public static class IndexStepTwoReducer
            extends Reducer<Text, Text, Text, Text> {

        private final Text outValue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values,
                Context context) throws IOException, InterruptedException {
            StringBuilder sb = new StringBuilder();
            for (Text value : values) {
                // Separator only BETWEEN fragments, so no trailing tab.
                if (sb.length() > 0) {
                    sb.append('\t');
                }
                sb.append(value.toString());
            }
            outValue.set(sb.toString());
            context.write(key, outValue);
        }
    }

    /** Configures and runs the step-two job over step one's output directory. */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(IndexStepTwo.class);
        job.setMapperClass(IndexStepTwoMapper.class);
        job.setReducerClass(IndexStepTwoReducer.class);
        job.setNumReduceTasks(1);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\mrdata\\index\\output"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\mrdata\\index\\output1"));
        job.waitForCompletion(true);
    }
}
输出结果如下
c++ c.txt-->2
hello a.txt-->4 b.txt-->4 c.txt-->4
jack b.txt-->1
java c.txt-->1
jerry b.txt-->1 c.txt-->1
jim a.txt-->1 b.txt-->1
kitty a.txt-->1 b.txt-->1
rose a.txt-->1
tom a.txt-->1