Best way to split log files - mapreduce

Need some help here; this seems like such a common task:
We have huge hourly log files containing many different events.
We have been using Hive to split these events out into different tables, in a hard-coded way:
from events
  insert overwrite table specificevent1
    select * where events.event_type='specificevent1'
  insert overwrite table specificevent2
    select * where events.event_type='specificevent2'
  ...;
This is problematic, as the code must change for every new event we add.
We tried to use dynamic partitioning to do the splitting automatically, but ran into problems:
If the partition schema is /year/month/day/hour/event, we cannot recover partitions for more than a day, since a month's worth would be roughly (30 days) x (24 hours) x (~100 events) ≈ 72k partitions, which is far too many to work with.
If the schema is event/year/month/day/hour, then because the event is the dynamic part it forces the subsequent partitions to be dynamic as well, and the splitting takes longer and longer as the number of partitions grows.
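For reference, the dynamic-partition attempt looks roughly like the sketch below (the target table events_split and the selected columns are illustrative placeholders, not our real schema):
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table events_split partition (year, month, day, hour, event_type)
select col1, col2, year, month, day, hour, event_type
from events;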
Is there a better way to do this (Hive and non-Hive solutions)?

Hope this will help others...
I found that Hive is not the way to go if you want to split a log file into many different files (one file per event_type).
The dynamic partitions offered by Hive have too many limitations, IMHO.
What I ended up doing was writing a custom MapReduce jar.
I also found the old Hadoop API much more suitable here, since it offers the MultipleTextOutputFormat abstract class, which lets you implement generateFileNameForKeyValue(). (The new Hadoop API offers a different multiple-output mechanism, MultipleOutputs, which is great if you have predefined output locations; I did not figure out how to derive them on the fly from the key/value.)
Example code:
/*
Run example:
hadoop jar DynamicSplit.jar DynamicEventSplit.DynamicEventSplitMultifileMapReduce /event/US/incoming/2013-01-01-01/ event US 2013-01-01-01 2 "[a-zA-Z0-9_ ]+" "/event/dynamicsplit1/" ","
*/
package DynamicEventSplit;

import java.io.*;
import java.util.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.*;
import org.apache.hadoop.util.*;
public class DynamicEventSplitMultifileMapReduce
{
    static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text>
    {
        private String event_name;
        private String EventNameRegexp;
        private int EventNameColumnNumber;
        private String columndelimeter=",";

        public void configure(JobConf job)
        {
            EventNameRegexp=job.get("EventNameRegexp");
            EventNameColumnNumber=Integer.parseInt(job.get("EventNameColumnNumber"));
            columndelimeter=job.get("columndelimeter");
        }

        public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
        {
            //check that the expected event_name field exists
            String[] dall=value.toString().split(columndelimeter);
            if (dall.length<EventNameColumnNumber)
            {
                return;
            }
            event_name=dall[EventNameColumnNumber-1];
            //check that the event_name is valid
            if (!event_name.matches(EventNameRegexp))
            {
                return;
            }
            //emit the event name as the key so the output format can route by it
            output.collect(new Text(event_name), value);
        }
    }
    static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text>
    {
        public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
        {
            while (values.hasNext())
            {
                output.collect(key, values.next());
            }
        }
    }
    static class MultiFileOutput extends MultipleTextOutputFormat<Text, Text>
    {
        private String event_name;
        private String site;
        private String event_date;
        private String year;
        private String month;
        private String day;
        private String hour;
        private String basepath;

        public RecordWriter<Text, Text> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable arg3) throws IOException
        {
            RecordWriter<Text, Text> rw=super.getRecordWriter(fs, job, name, arg3);
            site=job.get("site");
            event_date=job.get("date");
            year=event_date.substring(0,4);
            month=event_date.substring(5,7);
            day=event_date.substring(8,10);
            hour=event_date.substring(11,13);
            basepath=job.get("basepath");
            return rw;
        }

        //build one output directory per event/site/date-part, based on the key
        protected String generateFileNameForKeyValue(Text key, Text value, String leaf)
        {
            event_name=key.toString();
            return basepath+"event="+event_name+"/site="+site+"/year="+year+"/month="+month+"/day="+day+"/hour="+hour+"/"+leaf;
        }

        //drop the key from the output lines; only the original value is written
        protected Text generateActualKey(Text key, Text value)
        {
            return null;
        }
    }
    public static void main(String[] args) throws Exception
    {
        String InputFiles=args[0];
        String OutputDir=args[1];
        String SiteStr=args[2];
        String DateStr=args[3];
        String EventNameColumnNumber=args[4];
        String EventNameRegexp=args[5];
        String basepath=args[6];
        String columndelimeter=args[7];

        Configuration mycon=new Configuration();
        JobConf conf = new JobConf(mycon, DynamicEventSplitMultifileMapReduce.class);
        conf.set("site", SiteStr);
        conf.set("date", DateStr);
        conf.setOutputKeyClass(Text.class);
        conf.setMapOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(MultiFileOutput.class);
        conf.setMapSpeculativeExecution(false);
        conf.setReduceSpeculativeExecution(false);
        FileInputFormat.setInputPaths(conf, InputFiles);
        FileOutputFormat.setOutputPath(conf, new Path("/"+OutputDir+SiteStr+DateStr+"/"));
        conf.set("EventNameColumnNumber", EventNameColumnNumber);
        conf.set("EventNameRegexp", EventNameRegexp);
        conf.set("basepath", basepath);
        conf.set("columndelimeter", columndelimeter);
        JobClient.runJob(conf);
    }
}
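For what it's worth, the new mapreduce API's MultipleOutputs can also pick the output location per record, by passing a baseOutputPath string to write(). Below is a minimal sketch of that approach (my own sketch, not the job above; the class name and path layout are illustrative):
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class EventSplitReducer extends Reducer<Text, Text, NullWritable, Text>
{
    private MultipleOutputs<NullWritable, Text> mos;

    @Override
    protected void setup(Context context)
    {
        mos = new MultipleOutputs<NullWritable, Text>(context);
    }

    @Override
    protected void reduce(Text eventName, Iterable<Text> values, Context context) throws IOException, InterruptedException
    {
        for (Text line : values)
        {
            //the base output path is derived from the key, so the target directory is chosen on the fly
            mos.write(NullWritable.get(), line, "event=" + eventName.toString() + "/part");
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException
    {
        mos.close();
    }
}
(Using LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class) avoids creating empty default part files.)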

String concatenation in mapper class of a MapReduce Program giving errors

In my mapper class I want to do a small manipulation to a string read from a file (as a line) and then send it over to the reducer to get a string count. The manipulation is replacing null/empty strings with "0". (The current replace & join part is failing my Hadoop job.)
Here is my code:
import java.io.BufferedReader;
import java.io.IOException;
.....

public class PartNumberMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
    private static Text partString = new Text("");
    private final static IntWritable count = new IntWritable(1);

    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        String line = value.toString();
        // Read line by line to bufferreader and output the (line,count) pair
        BufferedReader bufReader = new BufferedReader(new StringReader(line));
        String l = null;
        while( (l=bufReader.readLine()) != null )
        {
            /**** This part is the problem ****/
            String a[] = l.split(",");
            if(a[1]==""){ // if a[1] i.e. second string is "" then set it to "0"
                a[1]="0";
                l = StringUtils.join(",", a); // join the string array to form a string
            }
            /**** problematic part ends ****/
            partString.set(l);
            output.collect(partString, count);
        }
    }
}
After this is run, the mapper just fails and doesn't report any errors.
[The code is run with YARN]
I am not sure what I am doing wrong; the same code worked without the string join part.
Could any of you explain what is wrong with the string replace/concat? Is there a better way to do it?
Here's a modified version of your Mapper class with a few changes:
Remove the BufferedReader, it seems redundant and isn't being closed
String equality should be .equals() and not ==
Declare a String array using String[] and not String a[]
Resulting in the following code:
public class PartNumberMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
    private Text partString = new Text();
    private final static IntWritable count = new IntWritable(1);

    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        String line = value.toString();
        String[] a = line.split(",");
        if (a[1].equals("")) {
            a[1] = "0";
            line = StringUtils.join(",", a);
        }
        partString.set(line);
        output.collect(partString, count);
    }
}
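One more thing worth guarding against (an assumption about the input, not something shown in the question): if a line contains no comma, a[1] will throw an ArrayIndexOutOfBoundsException, which also kills the mapper without an obvious message. A cheap guard, keeping the same join call as above:
String[] a = line.split(",", -1); // -1 keeps trailing empty fields
if (a.length < 2) {
    return;                       // skip (or count) malformed lines
}
if (a[1].equals("")) {
    a[1] = "0";
    line = StringUtils.join(",", a);
}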

Mapreduce output showing all records in same line

I have implemented a MapReduce operation for a log file using Amazon and Hadoop, with a custom jar.
My output shows the correct keys and values, but all the records are being displayed in a single line. For example, given the following pairs:
<1387, 2>
<1388, 1>
This is what's printing:
1387 21388 1
This is what I'm expecting:
1387 2
1388 1
How can I fix this?
Cleaned up your code for you :)
public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(LogAnalyzer.class);
    conf.setJobName("Loganalyzer");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(LogAnalyzer.Map.class);
    conf.setCombinerClass(LogAnalyzer.Reduce.class);
    conf.setReducerClass(LogAnalyzer.Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.set("mapreduce.textoutputformat.separator", "--");

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}

public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
    public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        int sum = 0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        output.collect(key, new IntWritable(sum));
    }
}

public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
    String line = ((Text) value).toString();
    Matcher matcher = p.matcher(line);
    if (matcher.matches()) {
        String timestamp = matcher.group(4);
        minute.set(getMinuteBucket(timestamp));
        output.collect(minute, ONE); // context.write(minute, one);
    }
}
This isn't Hadoop Streaming; it's just a normal Java job. You should amend the tag on the question.
This looks okay to me, although you don't have the mapper inside a class, which I assume is a copy/paste omission.
Regarding the line endings: are you by any chance looking at the output on Windows? It could be a problem with Unix/Windows line endings. If you open the file in Sublime Text or another advanced text editor you can switch between Unix and Windows line endings; see if that helps.

How can I override the test method name that appears on the TestNG report?

How can I override the test name that appears on the TestNG report? I want to override the name that appears in the middle column (which currently shows the method name). Is this even possible?
I tried to do it like this, but it didn't work.
public class EchApiTest1 extends TestBase {
    ...
    @BeforeTest
    public void setUp() {
        restClient = new RestClientPost();
        this.setTestName( "ech: XXXXXX" );
    }
And, the base class:
import org.testng.ITest;

public class TestBase implements ITest {
    String testName = "";

    @Override
    public String getTestName() {
        return this.testName;
    }

    public void setTestName( String name ) {
        this.testName = name;
    }
}
NOTE: The above code does work when I view the report detail in the Jenkins TestNG plugin report, which shows the overridden test name as a string called "Instance Name:" at the beginning of the Reporter log output. Why, in this case, does a setTestName() call alter a string labeled "Instance Name" in the report?
One answer I found had a suggestion like this, but I don't know how to pass an ITestResult arg to an @AfterMethod method:
@AfterMethod
public void setResultTestName( ITestResult result ) {
    try {
        BaseTestMethod bm = (BaseTestMethod) result.getMethod();
        Field f = bm.getClass().getSuperclass().getDeclaredField("m_methodName");
        f.setAccessible(true);
        f.set( bm, bm.getMethodName() + "." + your_customized_name );
    } catch ( Exception ex ) {
        Reporter.log( "ex" + ex.getMessage() );
    }
}
Thoughts?
Please find below the code to set a custom test-case name in TestNG reports.
The following features are available in this code:
Dynamic execution of the same test case multiple times
Custom test-case names for reports
Parallel execution of multiple test cases
import java.lang.reflect.Field;
import org.testng.ITest;
import org.testng.ITestResult;
import org.testng.Reporter;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Factory;
import org.testng.annotations.Test;
import org.testng.internal.BaseTestMethod;
import com.test.data.ServiceProcessData;

public class ServiceTest implements ITest {

    protected ServiceProcessData serviceProcessData;
    protected String testCaseName = "";

    @Test
    public void executeServiceTest() {
        System.out.println(this.serviceProcessData.toString());
    }

    @Factory(dataProvider = "processDataList")
    public ServiceTest(ServiceProcessData serviceProcessData) {
        this.serviceProcessData = serviceProcessData;
    }

    @DataProvider(name = "processDataList", parallel = true)
    public static Object[] getProcessDataList() {
        Object[] serviceProcessDataList = new Object[0];
        // Set data in serviceProcessDataList
        return serviceProcessDataList;
    }

    @Override
    public String getTestName() {
        this.testCaseName = "User custom testcase name";
        // this.testCaseName = this.serviceProcessData.getTestCaseCustomName();
        return this.testCaseName;
    }

    @AfterMethod(alwaysRun = true)
    public void setResultTestName(ITestResult result) {
        try {
            BaseTestMethod baseTestMethod = (BaseTestMethod) result.getMethod();
            Field f = baseTestMethod.getClass().getSuperclass().getDeclaredField("m_methodName");
            f.setAccessible(true);
            f.set(baseTestMethod, this.testCaseName);
        } catch (Exception e) {
            ErrorMessageHelper.getInstance().setErrorMessage(e);
            Reporter.log("Exception : " + e.getMessage());
        }
    }
}
Thanks
I found a "workaround", but I am hoping for a better answer. I want to be able to show this "test name" or "instance name" value on the HTML report (not just within the Reporter.log output), and I am starting to think it's not possible:
@Test(dataProvider = "restdata2")
public void testGetNameFromResponse( TestArguments testArgs ) {
    this.setTestName( "ech: " + testArgs.getTestName() );
    Reporter.log( getTestName() ); // this magic shows test name on report
    ....
With this workaround, the user can now identify which test it was by looking at the Reporter.log output, but I still wish the name were more prominent.
I suspect the answer lies in writing a TestListenerAdapter that somehow overrides the ITestResult.getTestName() method? That is the holy grail I am looking for.
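As a rough sketch of that idea (my own sketch, and only an assumption that it fits your reporting setup): a TestListenerAdapter does not let you replace ITestResult, but it can read getTestName() from each result, which returns the ITest-provided name (or null if the test class does not implement ITest), and log it wherever your report is built:
import org.testng.ITestResult;
import org.testng.Reporter;
import org.testng.TestListenerAdapter;

public class CustomNameListener extends TestListenerAdapter {
    @Override
    public void onTestSuccess(ITestResult result) {
        Reporter.log("Finished: " + displayName(result));
        super.onTestSuccess(result);
    }

    @Override
    public void onTestFailure(ITestResult result) {
        Reporter.log("Failed: " + displayName(result));
        super.onTestFailure(result);
    }

    private String displayName(ITestResult result) {
        // getTestName() returns the value supplied via ITest, or null otherwise
        return result.getTestName() != null ? result.getTestName() : result.getMethod().getMethodName();
    }
}
Register it via <listeners> in testng.xml or @Listeners on the test class. Note this only adds to the Reporter output; whether the default HTML report's middle column picks it up is exactly the open question here.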
The result object is passed in automatically to setResultTestName( ITestResult result ).
Make sure you put alwaysRun = true, as shown below, when you have groups defined in your test class; otherwise the @AfterMethod will not be executed.
@AfterMethod(alwaysRun = true)

MapReduce job with mixed data sources: HBase table and HDFS files

I need to implement an MR job which accesses data from both an HBase table and HDFS files. E.g., the mapper reads data from the HBase table and from HDFS files; the data share the same primary key but have different schemas. A reducer then joins all the columns (from the HBase table and the HDFS files) together.
I tried looking online and could not find a way to run an MR job with such mixed data sources. MultipleInputs seems to work only for multiple HDFS data sources. Please let me know if you have some ideas. Sample code would be great.
After a few days of investigation (and some help from the HBase user mailing list), I finally figured out how to do it. Here is the source code:
public class MixMR {

    public static class Map extends Mapper<Object, Text, Text, Text> {

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String s = value.toString();
            String[] sa = s.split(",");
            if (sa.length == 2) {
                context.write(new Text(sa[0]), new Text(sa[1]));
            }
        }
    }

    public static class TableMap extends TableMapper<Text, Text> {
        public static final byte[] CF = "cf".getBytes();
        public static final byte[] ATTR1 = "c1".getBytes();

        public void map(ImmutableBytesWritable row, Result value, Context context) throws IOException, InterruptedException {
            String key = Bytes.toString(row.get());
            String val = new String(value.getValue(CF, ATTR1));
            context.write(new Text(key), new Text(val));
        }
    }

    public static class Reduce extends Reducer<Object, Text, Object, Text> {
        public void reduce(Object key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String ks = key.toString();
            for (Text val : values) {
                context.write(new Text(ks), val);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Path inputPath1 = new Path(args[0]);
        Path inputPath2 = new Path(args[1]);
        Path outputPath = new Path(args[2]);

        String tableName = "test";

        Configuration config = HBaseConfiguration.create();
        Job job = new Job(config, "ExampleRead");
        job.setJarByClass(MixMR.class); // class that contains mapper

        Scan scan = new Scan();
        scan.setCaching(500);       // 1 is the default in Scan, which will be bad for MapReduce jobs
        scan.setCacheBlocks(false); // don't set to true for MR jobs
        scan.addFamily(Bytes.toBytes("cf"));

        TableMapReduceUtil.initTableMapperJob(
                tableName,      // input HBase table name
                scan,           // Scan instance to control CF and attribute selection
                TableMap.class, // mapper
                Text.class,     // mapper output key
                Text.class,     // mapper output value
                job);

        job.setReducerClass(Reduce.class); // reducer class
        job.setOutputFormatClass(TextOutputFormat.class);

        // inputPath1 here has no effect for HBase table
        MultipleInputs.addInputPath(job, inputPath1, TextInputFormat.class, Map.class);
        MultipleInputs.addInputPath(job, inputPath2, TableInputFormat.class, TableMap.class);

        FileOutputFormat.setOutputPath(job, outputPath);
        job.waitForCompletion(true);
    }
}
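For completeness, a run sketch in the same style as the earlier example (the jar name and paths are placeholders; the HBase table name is hard-coded to "test" above, and the second path only exists to satisfy MultipleInputs):
hadoop jar MixMR.jar MixMR /user/me/hdfs_input /user/me/hbase_dummy_path /user/me/join_output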
There is no OOTB feature that supports this. A possible workaround could be to Scan your HBase table and write the Results to an HDFS file first, and then do the reduce-side join using MultipleInputs. But this will incur some additional I/O overhead.
A Pig script or Hive query can do that easily.
Sample Pig script:
tbl = LOAD 'hbase://SampleTable'
      USING org.apache.pig.backend.hadoop.hbase.HBaseStorage(
      'info:* ...', '-loadKey true -limit 5')
      AS (id:bytearray, info_map:map[], ...);
fle = LOAD '/somefile' USING PigStorage(',') AS (id:bytearray, ...);
Joined = JOIN tbl BY id, fle BY id;
STORE Joined INTO '...';

ARFF output in Weka is different depending on whether it is incrementally saved

Below is a program that shows how strings are output incorrectly when the ARFF saver from Weka writes in incremental mode. The program runs in incremental mode if a parameter is passed to it, and in batch mode if no parameter is passed.
Note that in batch mode, the ARFF file contains strings ... normal operation.
In incremental mode, the ARFF file contains integers in place of strings ... strange!
Any ideas on how to get the ARFF saver to output strings in incremental mode?
import java.io.File;
import java.io.IOException;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ArffSaver;
import weka.core.converters.Saver;

public class ArffTest {
    static Instances instances;
    static ArffSaver saver;
    static boolean flag = false;

    public static void addData(String ticker, double price) throws IOException {
        int numAttr = instances.numAttributes(); // same for
        double[] vals = new double[numAttr];
        int i = 0;
        vals[i++] = instances.attribute(0).addStringValue(ticker);
        vals[i++] = price;
        Instance instance = new Instance(1.0, vals);
        if (flag)
            saver.writeIncremental(instance);
        else
            instances.add(instance);
    }

    public static void main(String[] args) {
        if (args.length > 0) {
            flag = true;
        }
        FastVector atts = new FastVector();                           // attributes
        atts.addElement(new Attribute("Ticker", (FastVector) null));  // symbol
        atts.addElement(new Attribute("Price"));                      // price that order exited at
        instances = new Instances("Samples", atts, 0);                // create header
        saver = new ArffSaver();
        saver.setInstances(instances);
        if (flag)
            saver.setRetrieval(Saver.INCREMENTAL);
        try {
            saver.setFile(new File("test.arff"));
            addData("YY", 23.0);
            addData("XY", 24.0);
            addData("XX", 29.0);
            if (flag)
                saver.writeIncremental(null);
            else
                saver.writeBatch();
        } catch (Exception e) {
            System.out.println("Exception");
        }
    }
}
You forgot to add the newly created Instance to the dataset.
Instance instance = new DenseInstance(1.0, vals);
instance.setDataset(instances); //Add instance!
if (flag)
    saver.writeIncremental(instance);
else
    instances.add(instance);
The Instance must have access to the dataset to retrieve the String attribute; if it doesn't, it just writes out the index.
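Put into the question's addData() method (still using the 3.6-style API from the question; only the setDataset call is new), the fix looks like this:
public static void addData(String ticker, double price) throws IOException {
    double[] vals = new double[instances.numAttributes()];
    vals[0] = instances.attribute(0).addStringValue(ticker);
    vals[1] = price;
    Instance instance = new Instance(1.0, vals);
    instance.setDataset(instances); // without this, only the string's index gets written
    if (flag)
        saver.writeIncremental(instance);
    else
        instances.add(instance);
}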
Besides that, I recommend using Weka 3.7.6. Instance is now an interface with two implementations.
cheers,
Muki