Fastest way to ingest data from BigQuery to PubSub - google-cloud-platform

At the moment I am going through the GCP docs trying to figure out what is the optimal/fastest way to ingest data from BigQuery (using Python) to PubSub. What I am doing so far (in a simplified way) is:
# Publisher tuning knobs.
# NOTE(review): flow control is capped at MAX_FLOW_MESSAGES (20) outstanding messages
# with BLOCK behavior, so publish() waits whenever 20 messages are in flight. This is
# presumably what limits throughput to a few dozen messages per second -- confirm by
# raising the limits.
MESSAGE_SIZE_IN_BYTES = 500
MAX_BATCH_MESSAGES = 20
MAX_BYTES_BATCH = MESSAGE_SIZE_IN_BYTES * MAX_BATCH_MESSAGES
BATCH_MAX_LATENCY_IN_10MS = 0.01
MAX_FLOW_MESSAGES = 20
MAX_FLOW_BYTES = MESSAGE_SIZE_IN_BYTES * MAX_FLOW_MESSAGES

batch_settings = pubsub_v1.types.BatchSettings(
    max_messages=MAX_BATCH_MESSAGES,
    max_bytes=MAX_BYTES_BATCH,
    max_latency=BATCH_MAX_LATENCY_IN_10MS,
)
publisher_options = pubsub_v1.types.PublisherOptions(
    flow_control=pubsub_v1.types.PublishFlowControl(
        message_limit=MAX_FLOW_MESSAGES,
        byte_limit=MAX_FLOW_BYTES,
        limit_exceeded_behavior=pubsub_v1.types.LimitExceededBehavior.BLOCK,
    ),
)
# The settings above are module-level names, so they are passed directly
# (there is no enclosing object to read them from).
pubsub_client = pubsub_v1.PublisherClient(
    credentials=credentials,
    batch_settings=batch_settings,
    publisher_options=publisher_options,
)

bigquery_client = ...  # placeholder: construct the BigQuery client here
bq_query_job = bigquery_client.query(QUERY)
rows = bq_query_job.result()

# Collect the futures so the caller can wait on them after the loop.
publish_futures = []
for row in rows:
    callback_obj = PubsubCallback(...)
    # A BigQuery Row is not JSON-serializable as-is; convert it to a plain dict.
    # default=str renders values json cannot encode natively as strings.
    json_data = json.dumps(dict(row.items()), default=str).encode("utf-8")
    publish_future = pubsub_client.publish(topic_path, json_data)
    publish_future.add_done_callback(callback_obj.callback)
    publish_futures.append(publish_future)
So, one message per row. I have been trying to tweak different parameters for the Pub/Sub publisher client, but I cannot get beyond 20-30 messages (rows) per second. Is there a way to publish rows read from BigQuery to Pub/Sub faster (at least 1000 times faster than now)?

We also have a need to get data from BigQuery into PubSub and we do so using Dataflow. I've just looked at one of the jobs we ran today and we loaded 3.4million rows in about 5 minutes (so ~11000 rows per second).
Our Dataflow jobs are written in java but you could write them in python if you wish. Here is the code for the pipeline I described above:
package com.ourcompany.pipelines;
import com.google.api.services.bigquery.model.TableRow;
import java.util.HashMap;
import java.util.Map;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.Validation.Required;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* The {#code BigQueryEventReplayer} pipeline runs a supplied SQL query
* against BigQuery, and sends the results one-by-one to PubSub
* The query MUST return a column named 'json', it is this column
* (and ONLY this column) that will be sent onward. The column must be a String type
* and should be valid JSON.
*/
public class BigQueryEventReplayer {
private static final Logger logger = LoggerFactory.getLogger(BigQueryEventReplayer.class);
/**
* Options for the BigQueryEventReplayer. See descriptions for more info
*/
public interface Options extends PipelineOptions {
#Description("SQL query to be run."
+ "An SQL string literal which will be run 'as is'")
#Required
ValueProvider<String> getBigQuerySql();
void setBigQuerySql(ValueProvider<String> value);
#Description("The name of the topic which data should be published to. "
+ "The name should be in the format of projects/<project-id>/topics/<topic-name>.")
#Required
ValueProvider<String> getOutputTopic();
void setOutputTopic(ValueProvider<String> value);
#Description("The ID of the BigQuery dataset targeted by the event")
#Required
ValueProvider<String> getBigQueryTargetDataset();
void setBigQueryTargetDataset(ValueProvider<String> value);
#Description("The ID of the BigQuery table targeted by the event")
#Required
ValueProvider<String> getBigQueryTargetTable();
void setBigQueryTargetTable(ValueProvider<String> value);
#Description("The SourceSystem attribute of the event")
#Required
ValueProvider<String> getSourceSystem();
void setSourceSystem(ValueProvider<String> value);
}
/**
* Takes the data from the TableRow and prepares it for the PubSub, including
* adding attributes to ensure the payload is routed correctly.
*/
public static class MapQueryToPubsub extends DoFn<TableRow, PubsubMessage> {
private final ValueProvider<String> targetDataset;
private final ValueProvider<String> targetTable;
private final ValueProvider<String> sourceSystem;
MapQueryToPubsub(
ValueProvider<String> targetDataset,
ValueProvider<String> targetTable,
ValueProvider<String> sourceSystem) {
this.targetDataset = targetDataset;
this.targetTable = targetTable;
this.sourceSystem = sourceSystem;
}
/**
* Entry point of DoFn for Dataflow.
*/
#ProcessElement
public void processElement(ProcessContext c) {
TableRow row = c.element();
if (!row.containsKey("json")) {
logger.warn("table does not contain column named 'json'");
}
Map<String, String> attributes = new HashMap<>();
attributes.put("sourceSystem", sourceSystem.get());
attributes.put("targetDataset", targetDataset.get());
attributes.put("targetTable", targetTable.get());
String json = (String) row.get("json");
c.output(new PubsubMessage(json.getBytes(), attributes));
}
}
/**
* Run the pipeline. This is the entrypoint for running 'locally'
*/
public static void main(String[] args) {
// Parse the user options passed from the command-line
Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
run(options);
}
/**
* Run the pipeline. This is the entrypoint that GCP will use
*/
public static PipelineResult run(Options options) {
Pipeline pipeline = Pipeline.create(options);
pipeline.apply("Read from BigQuery query",
BigQueryIO.readTableRows().fromQuery(options.getBigQuerySql()).usingStandardSql().withoutValidation()
.withTemplateCompatibility())
.apply("Map data to PubsubMessage",
ParDo.of(
new MapQueryToPubsub(
options.getBigQueryTargetDataset(),
options.getBigQueryTargetTable(),
options.getSourceSystem()
)
)
)
.apply("Write message to PubSub", PubsubIO.writeMessages().to(options.getOutputTopic()));
return pipeline.run();
}
}
This pipeline requires that each row retrieved from BigQuery is a JSON document, something that can easily be achieved using TO_JSON_STRING.
I know this might look rather daunting to some (it kinda does to me I admit) but it will get you the throughput that you require!
You can ignore this part:
// Builds the String-to-String map with the three entries "sourceSystem", "targetDataset" and "targetTable".
Map<String, String> attributes = new HashMap<>();
attributes.put("sourceSystem", sourceSystem.get());
attributes.put("targetDataset", targetDataset.get());
attributes.put("targetTable", targetTable.get());
that's just some extra attributes we add to the pubsub message purely for our own use.

Use Pub/Sub Batch Messages. This allows your code to batch multiple messages into a single call to the Pub/Sub service.
Example code from Google (link):
from concurrent import futures
from google.cloud import pubsub_v1

# TODO(developer)
# project_id = "your-project-id"
# topic_id = "your-topic-id"

# Configure the batch to publish as soon as there are 10 messages
# or 1 KiB of data, or 1 second has passed.
batch_settings = pubsub_v1.types.BatchSettings(
    max_messages=10,  # default 100
    max_bytes=1024,  # default 1 MB
    max_latency=1,  # default 10 ms
)
publisher = pubsub_v1.PublisherClient(batch_settings)
topic_path = publisher.topic_path(project_id, topic_id)
publish_futures = []


# Resolve the publish future in a separate thread.
def callback(future: pubsub_v1.publisher.futures.Future) -> None:
    """Print the message ID once the publish future has resolved."""
    message_id = future.result()
    print(message_id)


for n in range(1, 10):
    data_str = f"Message number {n}"
    # Data must be a bytestring
    data = data_str.encode("utf-8")
    publish_future = publisher.publish(topic_path, data)
    # Non-blocking. Allow the publisher client to batch multiple messages.
    publish_future.add_done_callback(callback)
    publish_futures.append(publish_future)

# Block until every queued message has been published (or has failed).
futures.wait(publish_futures, return_when=futures.ALL_COMPLETED)

print(f"Published messages with batch settings to {topic_path}.")

Related

Using Spanner within Apache Beam Dataflow

I am trying to add a Spanner connection within an Apache Beam ParDo(DoFn). I need to lookup some rows as part of the ParDo. The dataflow creates a number of workers (usually 4 max) and I use the startBundle and finishBundle methods to open and close the spanner connections for the workers lifetime. Then within the processElement method I perform the lookup for each item passing the DatabaseClient and using a singleUseReadOnlyTransaction.
I should add this is running as a dataflow under GCP
Some code to illustrate this.
private static CustomDoFn<String, TransactionImport> processRow = new CustomDoFn<String, TransactionImport>(){
private static final long serialVersionUID = 1L;
private Spanner spanner = null;
private DatabaseClient dbClient = null;
#StartBundle
public void startBundle(StartBundleContext c){
TransactionFileOptions options = c.getPipelineOptions().as(TransactionFileOptions.class);
com.google.cloud.spanner.SpannerOptions spannerOptions = com.google.cloud.spanner.SpannerOptions.newBuilder().build();
spanner = spannerOptions.getService();
String spannerProjectID = options.getSpannerProjectId();
String spannerInstanceID = options.getSpannerInstanceId();
String spannerDatabaseID = options.getSpannerDatabaseId();
DatabaseId db = DatabaseId.of(spannerProjectID, spannerInstanceID, spannerDatabaseID);
dbClient = spanner.getDatabaseClient(db);
}
#FinishBundle
public void finishBundle(FinishBundleContext c){
spanner.close();
}
#ProcessElement
public void processElement(DoFn<String, TransactionImport>.ProcessContext c) throws Exception {
TransactionImport import = new TransactionImport();
Statement statement = Statement.newBuilder("SELECT * FROM Table1 WHERE Name= #Name")
.bind("Name").to( text)
.build();
ResultSet resultSet = dbClient.singleUseReadOnlyTransaction().executeQuery(statement);
// set some value on import dependant on retrieved value
c.output(import);
}
This always results in the dataflow not completing and when I check the log I see:
Processing stuck in step Process Rows for at least 05m00s without outputting or completing in state process
at sun.misc.Unsafe.park(Native Method)
at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
at java.util.concurrent.SynchronousQueue$TransferStack.awaitFulfill(SynchronousQueue.java:458)
at java.util.concurrent.SynchronousQueue$TransferStack.transfer(SynchronousQueue.java:362)
at java.util.concurrent.SynchronousQueue.take(SynchronousQueue.java:924)
at com.google.common.util.concurrent.Uninterruptibles.takeUninterruptibly(Uninterruptibles.java:233)
at com.google.cloud.spanner.SessionPool$Waiter.take(SessionPool.java:411)
at com.google.cloud.spanner.SessionPool$Waiter.access$3300(SessionPool.java:399)
at com.google.cloud.spanner.SessionPool.getReadSession(SessionPool.java:754)
at com.google.cloud.spanner.DatabaseClientImpl.singleUseReadOnlyTransaction(DatabaseClientImpl.java:52)
at com.mycompany.pt.SpannerDataAccess.getBinDetails(SpannerDataAccess.java:197)
at com.mycompany.pt.transactionFiles.TransactionFileDataflow$1.processLine(TransactionFileDataflow.java:411)
at com.mycompany.pt.transactionFiles.TransactionFileDataflow$1.processElement(TransactionFileDataflow.java:336)
at com.mycompany.pt.transactionFiles.TransactionFileDataflow$1$DoFnInvoker.invokeProcessElement(Unknown Source)
`
Does anyone have any experience using Spanner like this within a ParDo?
I'm not a spanner expert, but maybe I can help:
You should use @Setup/@Teardown to connect to and disconnect from Spanner. @StartBundle/@FinishBundle get called multiple times over the lifetime of a worker. See here for more details: https://beam.apache.org/documentation/execution-model/#bundling-and-persistence
Does your processElement method ever emit an element using
c.output(...)? If not, beam will think your pipeline is stuck

Unable to read my config text file(Column Names) from GCS in dataflow

I have one source CSV file (without header) as well as header config CSV file (contains only column names) in GCS. I also have static table in Bigquery. I want to load source file into static table by using column header mapping (config file).
I tried a different approach earlier: I kept the header and the data in the same source file, then tried to split the header off and insert the data into BigQuery using the header-to-column mapping. I noticed this approach is NOT possible because Dataflow shuffles the data across multiple worker nodes, so I dropped it.
The below code i have used hard coded column names. I am looking approach to read column names from external config file (I want to make my code as dynamic).
package com.coe.cog;
import java.io.BufferedReader;
import java.util.*;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.values.PCollection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
public class SampleTest {
private static final Logger LOG = LoggerFactory.getLogger(SampleTest.class);
public static TableReference getGCDSTableReference() {
TableReference ref = new TableReference();
ref.setProjectId("myownproject");
ref.setDatasetId("DS_Employee");
ref.setTableId("tLoad14");
return ref;
}
static class TransformToTable extends DoFn<String, TableRow> {
#ProcessElement
public void processElement(ProcessContext c) {
String csvSplitBy = ",";
String lineHeader = "ID,NAME,AGE,SEX"; // Hard code column name but i want to read these header from GCS file.
String[] colmnsHeader = lineHeader.split(csvSplitBy); //Only Header array
String[] split = c.element().split(csvSplitBy); //Data section
TableRow row = new TableRow();
for (int i = 0; i < split.length; i++) {
row.set(colmnsHeader[i], split[i]);
}
c.output(row);
// }
}
}
public interface MyOptions extends PipelineOptions {
/*
* Param
*
*/
}
public static void main(String[] args) {
MyOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(MyOptions.class);
options.setTempLocation("gs://demo-bucket-data/temp");
Pipeline p = Pipeline.create(options);
PCollection<String> lines = p.apply("Read From Storage", TextIO.read().from("gs://demo-bucket-data/Demo/Test/SourceFile_WithOutHeader.csv"));
PCollection<TableRow> rows = lines.apply("Transform To Table",ParDo.of(new TransformToTable()));
rows.apply("Write To Table",BigQueryIO.writeTableRows().to(getGCDSTableReference())
.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
.withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_NEVER));
p.run();
}
}
Source File:
1,John,25,M
2,Smith,30,M
3,Josephine,20,F
Config File (Headers only):
ID,NAME,AGE,SEX
You have a couple of options:
Use a Dataflow/Beam side input to read the config/header file into some sort of collection, e.g. an ArrayList. It will be available to all workers in the cluster. You can then use the side input to dynamically assign the schema to the BigQuery table using DynamicDestinations.
Before dropping into your Dataflow pipeline, call the GCS API directly to grab your config/header file, parse it, and then use the results to set up your pipeline.
Using Beam's FileSystems API for reading config files from GCS, is another approach.
Advantages:
No need of additional dependencies, it's included with beam API.
Using GCP's client libraries can lead to dependency version issues.
We can use a beam's FileSystems API in any transforms.
Here is a snippet for reading files.
/**
 * Reads the whole file at {@code filePath} as UTF-8 text using Beam's FileSystems API.
 *
 * @param filePath file location, in the format gs://bucket/file
 * @return the file content as a single string
 * @throws RuntimeException wrapping the IOException if the file cannot be matched or read
 */
public static String loadSchema(String filePath) {
  try {
    // searching
    MatchResult.Metadata metadata = FileSystems.matchSingleFileSpec(filePath);
    // reading file; try-with-resources closes the reader (and the channel beneath it)
    try (java.io.Reader reader =
        Channels.newReader(
            FileSystems.open(metadata.resourceId()), StandardCharsets.UTF_8.name())) {
      // returning content as string. We can process it now.
      return CharStreams.toString(reader);
    }
  } catch (IOException e) {
    throw new RuntimeException("Failed to read " + filePath, e);
  }
}
Disadvantages of Sideinput
File's orientation changes.
It's hard to parse multiline file like Json and others.
Side Input can work for single line static values.

Running MapReduce on HBase exported table throws "Could not find a deserializer for the Value class: 'org.apache.hadoop.hbase.client.Result'"

I have taken the Hbase table backup using Hbase Export utility tool .
hbase org.apache.hadoop.hbase.mapreduce.Export "FinancialLineItem" "/project/fricadev/ESGTRF/EXPORT"
This has kicked in mapreduce and transferred all my table data into Output folder .
As per the documentation, the format of the output file is a sequence file.
So i ran below code to extract my key and value from the file .
Now i want to run mapreduce to read the key value from the output file but getting below exception
java.lang.Exception: java.io.IOException: Could not find a
deserializer for the Value class:
'org.apache.hadoop.hbase.client.Result'. Please ensure that the
configuration 'io.serializations' is properly configured, if you're
using custom serialization.
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:406)
Caused by: java.io.IOException: Could not find a deserializer for the Value class: 'org.apache.hadoop.hbase.client.Result'. Please
ensure that the configuration 'io.serializations' is properly
configured, if you're using custom serialization.
at org.apache.hadoop.io.SequenceFile$Reader.init(SequenceFile.java:1964)
at org.apache.hadoop.io.SequenceFile$Reader.initialize(SequenceFile.java:1811)
at org.apache.hadoop.io.SequenceFile$Reader.(SequenceFile.java:1760)
at org.apache.hadoop.io.SequenceFile$Reader.(SequenceFile.java:1774)
at org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader.initialize(SequenceFileRecordReader.java:50)
at org.apache.hadoop.mapred.MapTask$NewTrackingRecordReader.initialize(MapTask.java:478)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:671)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:330)
Here is my driver code
package SEQ;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class SeqDriver extends Configured implements Tool
{
public static void main(String[] args) throws Exception{
int exitCode = ToolRunner.run(new SeqDriver(), args);
System.exit(exitCode);
}
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.err.printf("Usage: %s needs two arguments files\n",
getClass().getSimpleName());
return -1;
}
String outputPath = args[1];
FileSystem hfs = FileSystem.get(getConf());
Job job = new Job();
job.setJarByClass(SeqDriver.class);
job.setJobName("SequenceFileReader");
HDFSUtil.removeHdfsSubDirIfExists(hfs, new Path(outputPath), true);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setOutputKeyClass(ImmutableBytesWritable.class);
job.setOutputValueClass(Result.class);
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setMapperClass(MySeqMapper.class);
job.setNumReduceTasks(0);
int returnValue = job.waitForCompletion(true) ? 0:1;
if(job.isSuccessful()) {
System.out.println("Job was successful");
} else if(!job.isSuccessful()) {
System.out.println("Job was not successful");
}
return returnValue;
}
}
Here is my mapper code
package SEQ;
import java.io.IOException;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MySeqMapper extends Mapper <ImmutableBytesWritable, Result, Text, Text>{
#Override
public void map(ImmutableBytesWritable row, Result value,Context context)
throws IOException, InterruptedException {
}
}
So i will answer my question
here is what was needed to make it work
Because we use HBase to store our data and this reducer outputs its result to HBase table, Hadoop is telling us that he doesn’t know how to serialize our data. That is why we need to help it. Inside setUp set the io.serializations variable
// Keeps whatever io.serializations already holds and appends HBase's MutationSerialization and ResultSerialization.
hbaseConf.setStrings("io.serializations", new String[]{hbaseConf.get("io.serializations"), MutationSerialization.class.getName(), ResultSerialization.class.getName()});

About GenericOptionsParser getRemainingArgs method

package com.ibm.dw61;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import com.ibm.dw61.MaxTempReducer;
import com.ibm.dw61.MaxTempMapper;
/**
 * Driver that wires MaxTempMapper and MaxTempReducer into a single job.
 */
public class MaxMonthlyTemp {

  /**
   * Strips Hadoop's generic options from the command line, expects exactly two
   * remaining arguments (input path, output path), then runs the job and exits
   * with 0 on success or 1 on failure. Exits with 2 when the arguments are wrong.
   */
  public static void main(String[] args) throws Exception {
    final Configuration configuration = new Configuration();
    final GenericOptionsParser optionsParser = new GenericOptionsParser(configuration, args);
    final String[] remaining = optionsParser.getRemainingArgs();

    // Anything other than <in> <out> is a usage error.
    final boolean wrongArgumentCount = remaining.length != 2;
    if (wrongArgumentCount) {
      System.err.println("Usage: MaxTemp <in> <out>");
      System.exit(2);
    }

    final Job maxTempJob = new Job(configuration, "Monthly Max Temp");
    maxTempJob.setJarByClass(MaxMonthlyTemp.class);

    // The reducer doubles as the combiner.
    maxTempJob.setMapperClass(MaxTempMapper.class);
    maxTempJob.setCombinerClass(MaxTempReducer.class);
    maxTempJob.setReducerClass(MaxTempReducer.class);

    maxTempJob.setOutputKeyClass(Text.class);
    maxTempJob.setOutputValueClass(IntWritable.class);

    final Path inputPath = new Path(remaining[0]);
    FileInputFormat.addInputPath(maxTempJob, inputPath);
    final Path outputPath = new Path(remaining[1]);
    FileOutputFormat.setOutputPath(maxTempJob, outputPath);

    // Submit the job, block until it finishes, and turn the outcome into an exit code.
    final boolean succeeded = maxTempJob.waitForCompletion(true);
    final int exitCode = succeeded ? 0 : 1;
    System.exit(exitCode);
  }
}
Questions :
1) This is a map-reduce code to extract max temperature for each month. The coder is trying to get non-generic options using the getRemainingArgs method. But the next line says if the number of non-generic options is not 2, that means there is an error and the program will immediately abort. I couldn’t figure out what is the coder’s logic here. Anyone kind enough to explain?
2) In another example Wordcount, the coder didn’t perform this step of getting non-generic options. So under what circumstances do we have to perform this step and testing whether the non-generic options numbers 2?
as you can see in the Hadoop API documentation, purpose of the method getRemainingArgs is to extract application-specific arguments , those that are not related to Hadoop framework. in this code, you should specify two arguments, first your input and then output, as you can see in the Usage

How to checkout and checkin any document outside alfresco using rest API?

I have created one Web Application using Servlets and JSP. Through that I have connected to alfresco repository. I am also able be to upload document in Alfresco and view document in external web application.
Now my requirement is, I have to give checkin and checkout option to those documents.
I found the REST APIs below for this purpose.
But I do not understand how to use these APIs in servlets to fulfill my requirement.
POST /alfresco/service/slingshot/doclib/action/cancel-checkout/site/{site}/{container}/{path}
POST /alfresco/service/slingshot/doclib/action/cancel-checkout/node/{store_type}/{store_id}/{id}
Can anyone please provide the simple steps or some piece of code to do this task?
Thanks in advance.
Please do not use the internal slingshot URLs for this. Instead, use OpenCMIS from Apache Chemistry. It will save you a lot of time and headaches and it is more portable to other repositories besides Alfresco.
The example below grabs an existing document by path, performs a checkout, then checks in a new major version of the plain text document.
package com.someco.cmis.examples;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.chemistry.opencmis.client.api.Document;
import org.apache.chemistry.opencmis.client.api.ObjectId;
import org.apache.chemistry.opencmis.client.api.Repository;
import org.apache.chemistry.opencmis.client.api.Session;
import org.apache.chemistry.opencmis.client.api.SessionFactory;
import org.apache.chemistry.opencmis.client.runtime.SessionFactoryImpl;
import org.apache.chemistry.opencmis.commons.SessionParameter;
import org.apache.chemistry.opencmis.commons.data.ContentStream;
import org.apache.chemistry.opencmis.commons.enums.BindingType;
/**
 * Grabs an existing document by path, checks it out, then checks in a new major
 * version with replacement plain-text content, using the CMIS AtomPub binding.
 */
public class CheckoutCheckinExample {
  // CMIS 1.1 AtomPub service URL
  private final String serviceUrl =
      "http://localhost:8080/alfresco/api/-default-/public/cmis/versions/1.1/atom";
  // Created on first use by getSession()
  private Session session = null;

  public static void main(String[] args) {
    CheckoutCheckinExample cce = new CheckoutCheckinExample();
    cce.doExample();
  }

  /**
   * Checks out /test/test-plain-1.txt, checks in a new major version and prints the
   * resulting version label. If the check-in fails, the checkout is cancelled so the
   * document is not left checked out, and the original exception is rethrown.
   */
  public void doExample() {
    // Always go through getSession() so the session is guaranteed to be initialized.
    Session cmisSession = getSession();
    Document doc = (Document) cmisSession.getObjectByPath("/test/test-plain-1.txt");
    String fileName = doc.getName();
    ObjectId pwcId = doc.checkOut(); // Checkout the document
    Document pwc = (Document) cmisSession.getObject(pwcId); // Get the working copy

    ObjectId updatedId;
    try {
      // Set up an updated content stream
      String docText = "This is a new major version.";
      // Explicit charset: the bytes must not depend on the JVM's platform default.
      byte[] content = docText.getBytes(java.nio.charset.StandardCharsets.UTF_8);
      InputStream stream = new ByteArrayInputStream(content);
      ContentStream contentStream =
          cmisSession.getObjectFactory().createContentStream(fileName, content.length, "text/plain", stream);
      // Check in the working copy as a major version with a comment
      updatedId = pwc.checkIn(true, null, contentStream, "My new version comment");
    } catch (RuntimeException e) {
      try {
        pwc.cancelCheckOut();
      } catch (RuntimeException cancelFailure) {
        // Keep the original failure as the primary error.
        e.addSuppressed(cancelFailure);
      }
      throw e;
    }

    doc = (Document) cmisSession.getObject(updatedId);
    System.out.println("Doc is now version: " + doc.getProperty("cmis:versionLabel").getValueAsString());
  }

  /**
   * Returns the CMIS session, creating it on first call against the first repository
   * the server reports.
   *
   * @return the cached session
   */
  public Session getSession() {
    if (session == null) {
      // default factory implementation
      SessionFactory factory = SessionFactoryImpl.newInstance();
      Map<String, String> parameter = new HashMap<String, String>();
      // user credentials -- hard-coded for the example only
      parameter.put(SessionParameter.USER, "admin"); // <-- Replace
      parameter.put(SessionParameter.PASSWORD, "admin"); // <-- Replace
      // connection settings (AtomPub binding)
      parameter.put(SessionParameter.ATOMPUB_URL, this.serviceUrl);
      parameter.put(SessionParameter.BINDING_TYPE, BindingType.ATOMPUB.value());
      List<Repository> repositories = factory.getRepositories(parameter);
      this.session = repositories.get(0).createSession();
    }
    return this.session;
  }
}
Note that on the version of Alfresco I tested with (5.1.e) the document must already have the versionable aspect applied for the version label to get incremented, otherwise the checkin will simply override the original.