Mapreduce output showing all records in same line - elastic-map-reduce

I have implemented a MapReduce job for a log file on Amazon Elastic MapReduce using Hadoop with a custom JAR.
My output shows the correct keys and values, but all the records are being written on a single line. For example, given the following pairs:
<1387, 2>
<1388, 1>
This is what's printing:
1387 21388 1
This is what I'm expecting:
1387 2
1388 1
How can I fix this?

Cleaned up your code for you :)
public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(LogAnalyzer.class);
    conf.setJobName("Loganalyzer");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(LogAnalyzer.Map.class);
    conf.setCombinerClass(LogAnalyzer.Reduce.class);
    conf.setReducerClass(LogAnalyzer.Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.set("mapreduce.textoutputformat.separator", "--");

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}

public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
    public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        int sum = 0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        output.collect(key, new IntWritable(sum));
    }
}

public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
    String line = value.toString();
    Matcher matcher = p.matcher(line);
    if (matcher.matches()) {
        String timestamp = matcher.group(4);
        minute.set(getMinuteBucket(timestamp));
        output.collect(minute, ONE); // context.write(minute, one);
    }
}
This isn't hadoop-streaming, it's just a normal java job. You should amend the tag on the question.
This looks okay to me, although you don't have the mapper inside a class, which I assume is a copy/paste omission.
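For reference, here is a sketch of what that wrapper might look like. The p, minute, and ONE fields and the getMinuteBucket() helper are assumed from your snippet, and the pattern and bucket logic below are only placeholders, not your actual code:
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
    // Placeholder pattern: group 4 is assumed to be the timestamp, as in your map() above.
    private static final Pattern p = Pattern.compile("^(\\S+) (\\S+) (\\S+) \\[([^\\]]+)\\].*$");
    private static final IntWritable ONE = new IntWritable(1);
    private final Text minute = new Text();

    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output,
                    Reporter reporter) throws IOException {
        Matcher matcher = p.matcher(value.toString());
        if (matcher.matches()) {
            minute.set(getMinuteBucket(matcher.group(4)));
            output.collect(minute, ONE);
        }
    }

    // Placeholder for your getMinuteBucket(); substitute your real implementation.
    private static String getMinuteBucket(String timestamp) {
        return timestamp.substring(0, 16); // e.g. truncate the timestamp to the minute
    }
}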
As for the line endings: I don't suppose you are looking at the output on Windows? It could be a problem with Unix/Windows line endings. If you open the file in Sublime Text or another advanced text editor, you can switch between Unix and Windows line endings; see if that works.
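If you'd rather check the file itself than rely on an editor, a small standalone sketch like the one below (the class name and the path argument are my own placeholders) counts the newline bytes in a part file. If the count matches the number of records, the job's output is fine and this is purely a viewer issue:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical check, not part of the job itself: count newline bytes in the
// output part file to confirm the records really are on separate lines.
public class NewlineCheck {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        long newlines = 0;
        try (FSDataInputStream in = fs.open(new Path(args[0]))) { // e.g. the part-00000 file
            int b;
            while ((b = in.read()) != -1) {
                if (b == '\n') {
                    newlines++;
                }
            }
        }
        System.out.println("Record separators (newline count): " + newlines);
    }
}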

Related

String concatenation in mapper class of a MapReduce Program giving errors

In my mapper class I want to do a small manipulation to a string read from a file (as a line) and then send it over to the reducer to get a string count. The manipulation is replacing empty strings with "0". (The current replace & join part is failing my Hadoop job.)
Here is my code:
import java.io.BufferedReader;
import java.io.IOException;
.....
public class PartNumberMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
    private static Text partString = new Text("");
    private final static IntWritable count = new IntWritable(1);

    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        String line = value.toString();
        // Read line by line from a BufferedReader and output the (line, count) pair
        BufferedReader bufReader = new BufferedReader(new StringReader(line));
        String l = null;
        while ((l = bufReader.readLine()) != null) {
            /**** This part is the problem ****/
            String a[] = l.split(",");
            if (a[1] == "") { // if a[1] i.e. second string is "" then set it to "0"
                a[1] = "0";
                l = StringUtils.join(",", a); // join the string array to form a string
            }
            /**** problematic part ends ****/
            partString.set(l);
            output.collect(partString, count);
        }
    }
}
After this is run, the mapper just fails and doesn't post any errors.
[The code is run with YARN.]
I am not sure what I am doing wrong; the same code worked without the string join part.
Could any of you explain what is wrong with the string replace/concat? Is there a better way to do it?
Here's a modified version of your Mapper class with a few changes:
Remove the BufferedReader; it seems redundant and isn't being closed
String equality should be .equals() and not ==
Declare a String array using String[] and not String a[]
Resulting in the following code:
public class PartNumberMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
    private Text partString = new Text();
    private final static IntWritable count = new IntWritable(1);

    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        String line = value.toString();
        String[] a = line.split(",");
        if (a[1].equals("")) {
            a[1] = "0";
            line = StringUtils.join(",", a);
        }
        partString.set(line);
        output.collect(partString, count);
    }
}
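One likely cause worth noting (an assumption on my part, since no stack trace was posted): split(",") drops trailing empty strings, so a line such as P123, yields an array of length 1 and a[1] throws an ArrayIndexOutOfBoundsException. A hypothetical helper that guards against that might look like this; it mirrors the StringUtils.join call from the question:
// Hypothetical guard, not part of the original answer: pad a missing second
// field with "0" and tolerate lines that have no comma at all.
private static String padSecondField(String line) {
    String[] a = line.split(",", -1);    // limit -1 keeps trailing empty fields: "P123," -> ["P123", ""]
    if (a.length > 1 && a[1].isEmpty()) {
        a[1] = "0";
        return StringUtils.join(",", a); // same join call as in the question
    }
    return line;
}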

Mapreduce MultipleOutputs error

I want to store the output of a MapReduce job in two different directories.
Even though my code is designed to store the same output in two different directories, it is not working as expected.
My driver class code is below:
public class WordCountMain {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job myhadoopJob = new Job(conf);
        myhadoopJob.setJarByClass(WordCountMain.class);
        myhadoopJob.setJobName("WORD COUNT JOB");
        FileInputFormat.addInputPath(myhadoopJob, new Path(args[0]));
        myhadoopJob.setMapperClass(WordCountMapper.class);
        myhadoopJob.setReducerClass(WordCountReducer.class);
        myhadoopJob.setInputFormatClass(TextInputFormat.class);
        myhadoopJob.setOutputFormatClass(TextOutputFormat.class);
        myhadoopJob.setMapOutputKeyClass(Text.class);
        myhadoopJob.setMapOutputValueClass(IntWritable.class);
        myhadoopJob.setOutputKeyClass(Text.class);
        myhadoopJob.setOutputValueClass(IntWritable.class);
        MultipleOutputs.addNamedOutput(myhadoopJob, "output1", TextOutputFormat.class, Text.class, IntWritable.class);
        MultipleOutputs.addNamedOutput(myhadoopJob, "output2", TextOutputFormat.class, Text.class, IntWritable.class);
        FileOutputFormat.setOutputPath(myhadoopJob, new Path(args[1]));
        System.exit(myhadoopJob.waitForCompletion(true) ? 0 : 1);
    }
}
My Mapper Code
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String word = null;
        StringTokenizer st = new StringTokenizer(line, ",");
        while (st.hasMoreTokens()) {
            word = st.nextToken();
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
My Reducer Code is below
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    MultipleOutputs mout = null;

    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        int num = 0;
        Iterator<IntWritable> ie = values.iterator();
        while (ie.hasNext()) {
            num = ie.next().get(); // 1
            count = count + num;
        }
        mout.write("output1", key, new IntWritable(count));
        mout.write("output2", key, new IntWritable(count));
    }

    @Override
    protected void setup(org.apache.hadoop.mapreduce.Reducer.Context context) throws IOException, InterruptedException {
        super.setup(context);
        mout = new MultipleOutputs<Text, IntWritable>(context);
    }
}
I am simply giving the output directories in the reduce method itself.
But when I run this MapReduce job using the command below, it does nothing; MapReduce does not even start, it just sits there idle.
hadoop jar WordCountMain.jar /user/cloudera/inputfiles/words.txt /user/cloudera/outputfiles/mapreduce/multipleoutputs
Could someone explain what went wrong and how I can correct it in my code?
What actually happens is that two output files with different names are stored inside /user/cloudera/outputfiles/mapreduce/multipleoutputs,
but what I need is to store the output files in different directories.
In Pig we can do this with two STORE statements pointing at different directories.
How do I achieve the same in MapReduce?
Can you try closing the MultipleOutputs object in the reducer's cleanup() method?
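A minimal sketch of that cleanup() for the WordCountReducer above; closing MultipleOutputs flushes its record writers, otherwise the named outputs can end up empty or truncated:
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    mout.close();           // flush and close the named outputs
    super.cleanup(context);
}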

What will happen if we skip reducer by keeping mapper and combiner in Mapreduce

My input file, which is 10 GB in size, is at
/user/cloudera/inputfiles/records.txt
Here is my driver class code:
public class WordCountMain {
    /**
     * @param args
     */
    public static void main(String[] args) throws Exception {
        // TODO Auto-generated method stub
        Configuration conf = new Configuration();
        Path inputFilePath = new Path(args[0]);
        Path outputFilePath = new Path(args[1]);
        Job job = new Job(conf, "word count");
        job.getConfiguration().set("mapred.job.queue.name", "omega");
        job.setJarByClass(WordCountMain.class);
        FileInputFormat.addInputPath(job, inputFilePath);
        FileOutputFormat.setOutputPath(job, outputFilePath);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapperClass(WordCountMapper.class);
        job.setCombinerClass(WordCountCombiner.class);
        job.setNumReduceTasks(0);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
I have code for the mapper and the combiner, and I have set the number of reducers to zero.
Here is my mapper code:
public class WordCountMapper extends Mapper<Object, Text, Text, IntWritable> {
    public static IntWritable one = new IntWritable(1);

    protected void map(Object key, Text value, Context context) throws java.io.IOException, java.lang.InterruptedException {
        String line = value.toString();
        String eachWord = null;
        StringTokenizer st = new StringTokenizer(line, "|");
        while (st.hasMoreTokens()) {
            eachWord = st.nextToken();
            context.write(new Text(eachWord), one);
        }
    }
}
I have written my own combiner.
Here is my combiner code:
public class WordCountCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws java.io.IOException, java.lang.InterruptedException {
        int count = 0;
        for (IntWritable i : values) {
            count = count + i.get();
        }
        context.write(key, new IntWritable(count));
    }
}
My question here is: what output will get stored? The output of the mapper, or the output of the combiner?
Or will the combiner get executed only if there is a reducer phase?
Please help.
You cannot be sure how many times the combiner function will run, or whether it will run at all; running the combiner also does not depend on whether you specify a reducer for your job. In your case it will simply produce 160 output files (10240 MB / 64 MB block size = 160 map tasks).
If you skip setting the mapper and reducer, Hadoop moves forward with its defaults.
For example, it will use:
IdentityMapper.class as the default mapper.
The default input format is TextInputFormat.
The default partitioner is HashPartitioner.
By default there is a single reducer, and therefore a single partition.
The default reducer is Reducer, again a generic type.
The default output format is TextOutputFormat, which writes out records, one per line, converting keys and values to strings and separating them with a tab character.
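To illustrate those defaults, here is a minimal driver sketch (the class name and paths are placeholders; it uses the same new-API Job as the question, where the corresponding defaults are the identity Mapper and Reducer classes). With nothing else configured, each output record is the input line's byte offset, a tab, and the original line:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DefaultsOnlyJob {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "defaults only");
        job.setJarByClass(DefaultsOnlyJob.class);
        // No mapper, combiner, partitioner, reducer or formats are set here,
        // so the defaults described above are what actually runs.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}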

MapReduce job with mixed data sources: HBase table and HDFS files

I need to implement an MR job which accesses data from both an HBase table and HDFS files. E.g., the mapper reads data from the HBase table and from HDFS files; the data share the same primary key but have different schemas. A reducer then joins all the columns (from the HBase table and the HDFS files) together.
I tried looking online and could not find a way to run an MR job with such mixed data sources. MultipleInputs seems to work only for multiple HDFS data sources. Please let me know if you have some ideas. Sample code would be great.
After a few days of investigation (and getting help from the HBase user mailing list), I finally figured out how to do it. Here is the source code:
public class MixMR {

    public static class Map extends Mapper<Object, Text, Text, Text> {
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String s = value.toString();
            String[] sa = s.split(",");
            if (sa.length == 2) {
                context.write(new Text(sa[0]), new Text(sa[1]));
            }
        }
    }

    public static class TableMap extends TableMapper<Text, Text> {
        public static final byte[] CF = "cf".getBytes();
        public static final byte[] ATTR1 = "c1".getBytes();

        public void map(ImmutableBytesWritable row, Result value, Context context) throws IOException, InterruptedException {
            String key = Bytes.toString(row.get());
            String val = new String(value.getValue(CF, ATTR1));
            context.write(new Text(key), new Text(val));
        }
    }

    public static class Reduce extends Reducer<Object, Text, Object, Text> {
        public void reduce(Object key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String ks = key.toString();
            for (Text val : values) {
                context.write(new Text(ks), val);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Path inputPath1 = new Path(args[0]);
        Path inputPath2 = new Path(args[1]);
        Path outputPath = new Path(args[2]);

        String tableName = "test";

        Configuration config = HBaseConfiguration.create();
        Job job = new Job(config, "ExampleRead");
        job.setJarByClass(MixMR.class); // class that contains mapper

        Scan scan = new Scan();
        scan.setCaching(500);       // 1 is the default in Scan, which will be bad for MapReduce jobs
        scan.setCacheBlocks(false); // don't set to true for MR jobs
        scan.addFamily(Bytes.toBytes("cf"));

        TableMapReduceUtil.initTableMapperJob(
                tableName,      // input HBase table name
                scan,           // Scan instance to control CF and attribute selection
                TableMap.class, // mapper
                Text.class,     // mapper output key
                Text.class,     // mapper output value
                job);

        job.setReducerClass(Reduce.class); // reducer class
        job.setOutputFormatClass(TextOutputFormat.class);

        // inputPath1 here has no effect for the HBase table
        MultipleInputs.addInputPath(job, inputPath1, TextInputFormat.class, Map.class);
        MultipleInputs.addInputPath(job, inputPath2, TableInputFormat.class, TableMap.class);

        FileOutputFormat.setOutputPath(job, outputPath);
        job.waitForCompletion(true);
    }
}
There is no OOTB feature that supports this. A possible workaround could be to scan your HBase table and write the Results to an HDFS file first, and then do the reduce-side join using MultipleInputs. But this will incur some additional I/O overhead.
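A rough sketch of that workaround, as I imagine it (the table name, column family and qualifier are placeholders, reusing the same Scan settings as the answer above): a map-only job dumps the table to text on HDFS, and the resulting files can then be joined file-to-file with MultipleInputs.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical map-only dump job, not from the original answer.
public class TableDump {

    public static class DumpMapper extends TableMapper<Text, Text> {
        @Override
        protected void map(ImmutableBytesWritable row, Result value, Context context)
                throws IOException, InterruptedException {
            // Emit "rowkey <TAB> cf:c1" so the dump can later be joined with the HDFS files.
            String key = Bytes.toString(row.get());
            String val = Bytes.toString(value.getValue(Bytes.toBytes("cf"), Bytes.toBytes("c1")));
            context.write(new Text(key), new Text(val));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        Job job = new Job(conf, "hbase table dump");
        job.setJarByClass(TableDump.class);

        Scan scan = new Scan();
        scan.setCaching(500);
        scan.setCacheBlocks(false);

        TableMapReduceUtil.initTableMapperJob("test", scan, DumpMapper.class,
                Text.class, Text.class, job);
        job.setNumReduceTasks(0); // map-only: just materialize the rows as text on HDFS

        FileOutputFormat.setOutputPath(job, new Path(args[0]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}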
A Pig script or Hive query can do that easily.
Sample Pig script:
tbl = LOAD 'hbase://SampleTable'
      USING org.apache.pig.backend.hadoop.hbase.HBaseStorage(
      'info:* ...', '-loadKey true -limit 5')
      AS (id:bytearray, info_map:map[], ...);
fle = LOAD '/somefile' USING PigStorage(',') AS (id:bytearray, ...);
joined = JOIN tbl BY id, fle BY id;
STORE joined INTO '...';

Best way to split log files

Need help with what seems like such a common task:
We have huge hourly log files containing many different events.
We have been using Hive to split these events into different files, in a hard-coded way:
from events
    insert overwrite table specificevent1
        select *
        where events.event_type='specificevent1'
    insert overwrite table specificevent2
        select *
        where events.event_type='specificevent2'
    ...;
This is problematic, as the code must change for each new event that we add.
We tried to use dynamic partitioning to do the splitting automatically, but ran into problems:
If my partition schema is /year/month/day/hour/event, then we cannot recover partitions of more than a day, as the number of partitions for a month would be roughly (30 days)(24 hours)(~100 events) = ~72k, which is way too many to work with.
If my schema is event/year/month/day/hour, then since the event is the dynamic part it forces the following partitions to be scripted as dynamic as well, and this causes the splitting to take more time as the number of partitions grows.
Is there a better way to do this (Hive and non-Hive solutions)?
Hope this will help others...
I found that Hive is not the way to go if you want to split a logfile into many different files (one file per event_type).
The dynamic partitions offered by Hive have too many limitations, IMHO.
What I ended up doing is writing a custom map-reduce JAR.
I also found the old Hadoop API much more suitable, as it offers the MultipleTextOutputFormat abstract class, which lets you implement generateFileNameForKeyValue(). (The new Hadoop API offers a different multiple-output-file mechanism, MultipleOutputs, which is great if you have predefined output locations, but I did not figure out how to derive them on the fly from the key/value.)
example code:
/*
Run example:
hadoop jar DynamicSplit.jar DynamicEventSplit.DynamicEventSplitMultifileMapReduce /event/US/incoming/2013-01-01-01/ event US 2013-01-01-01 2 "[a-zA-Z0-9_ ]+" "/event/dynamicsplit1/" ","
*/
package DynamicEventSplit;
import java.io.*;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.mapred.lib.*;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;
public class DynamicEventSplitMultifileMapReduce {

    static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {
        private String event_name;
        private String EventNameRegexp;
        private int EventNameColumnNumber;
        private String columndelimeter = ",";

        public void configure(JobConf job) {
            EventNameRegexp = job.get("EventNameRegexp");
            EventNameColumnNumber = Integer.parseInt(job.get("EventNameColumnNumber"));
            columndelimeter = job.get("columndelimeter");
        }

        public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
            //check that expected event_name field exists
            String[] dall = value.toString().split(columndelimeter);
            if (dall.length < EventNameColumnNumber) {
                return;
            }
            event_name = dall[EventNameColumnNumber - 1];
            //check that expected event_name is valid
            if (!event_name.matches(EventNameRegexp)) {
                return;
            }
            output.collect(new Text(dall[1]), value);
        }
    }

    static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
            while (values.hasNext()) {
                output.collect(key, values.next());
            }
        }
    }

    static class MultiFileOutput extends MultipleTextOutputFormat<Text, Text> {
        private String event_name;
        private String site;
        private String event_date;
        private String year;
        private String month;
        private String day;
        private String hour;
        private String basepath;

        public RecordWriter<Text, Text> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable arg3) throws IOException {
            RecordWriter<Text, Text> rw = super.getRecordWriter(fs, job, name, arg3);
            site = job.get("site");
            event_date = job.get("date");
            year = event_date.substring(0, 4);
            month = event_date.substring(5, 7);
            day = event_date.substring(8, 10);
            hour = event_date.substring(11, 13);
            basepath = job.get("basepath");
            return rw;
        }

        protected String generateFileNameForKeyValue(Text key, Text value, String leaf) {
            event_name = key.toString();
            return basepath + "event=" + event_name + "/site=" + site + "/year=" + year + "/month=" + month + "/day=" + day + "/hour=" + hour + "/" + leaf;
        }

        protected Text generateActualKey(Text key, Text value) {
            return null;
        }
    }

    public static void main(String[] args) throws Exception {
        String InputFiles = args[0];
        String OutputDir = args[1];
        String SiteStr = args[2];
        String DateStr = args[3];
        String EventNameColumnNumber = args[4];
        String EventNameRegexp = args[5];
        String basepath = args[6];
        String columndelimeter = args[7];

        Configuration mycon = new Configuration();
        JobConf conf = new JobConf(mycon, DynamicEventSplitMultifileMapReduce.class);
        conf.set("site", SiteStr);
        conf.set("date", DateStr);
        conf.setOutputKeyClass(Text.class);
        conf.setMapOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(MultiFileOutput.class);
        conf.setMapSpeculativeExecution(false);
        conf.setReduceSpeculativeExecution(false);
        FileInputFormat.setInputPaths(conf, InputFiles);
        FileOutputFormat.setOutputPath(conf, new Path("/" + OutputDir + SiteStr + DateStr + "/"));
        conf.set("EventNameColumnNumber", EventNameColumnNumber);
        conf.set("EventNameRegexp", EventNameRegexp);
        conf.set("basepath", basepath);
        conf.set("columndelimeter", columndelimeter);

        JobClient.runJob(conf);
    }
}