Need insights about writing a pass - llvm

For my source code, I have the following IR:
; ModuleID = '<stdin>'
@.str = private unnamed_addr constant [9 x i8] c"SOME_ENV_VAR\00", align 1
@.str1 = private unnamed_addr constant [26 x i8] c"Need to set $ENV_Variable.\0A\00", align 1
; Function Attrs: nounwind
define void @foo(i8* %bar) #0 {
entry:
%bar.addr = alloca i8*, align 4
%baz = alloca i8*, align 4
store i8* %bar, i8** %bar.addr, align 4
%call = call i8* @getenv(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0)) #2
store i8* %call, i8** %baz, align 4
%0 = load i8** %baz, align 4
%cmp = icmp eq i8* %0, null
br i1 %cmp, label %if.then, label %if.else
if.then: ; preds = %entry
%call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([26 x i8]* @.str1, i32 0, i32 0))
br label %if.end
if.else: ; preds = %entry
%1 = load i8** %bar.addr, align 4
%2 = load i8** %baz, align 4
%call2 = call i8* @strcpy(i8* %1, i8* %2) #2
br label %if.end
if.end: ; preds = %if.else, %if.then
ret void
}
; Function Attrs: nounwind
declare i8* @getenv(i8*) #0
declare i32 @printf(i8*, ...) #1
; Function Attrs: nounwind
declare i8* @strcpy(i8*, i8*) #0
I intend to write a pass, which when compiled (using LLVM), produces bitcode where the call to strcpy(dest,src) is replaced with strncpy(dest,src,n).
I've written the following code so far:
#include <stdlib.h>
#include <stdio.h>
#include "llvm/Pass.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/IR/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Assembly/PrintModulePass.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
namespace
{
Module* makeLLVMModule() {
    Module* mod = new Module(llvm::StringRef("CustomPass"), getGlobalContext());
    Constant* c = mod->getOrInsertFunction(llvm::StringRef("foo"), Type::getInt32Ty(getGlobalContext()), NULL);
    Function* foo = cast<Function>(c);
    Function::arg_iterator args = foo->arg_begin();
    Value* bar = args++;
    BasicBlock* Entry = BasicBlock::Create(getGlobalContext(), llvm::Twine("Entry"), foo);
    BasicBlock* False = BasicBlock::Create(getGlobalContext(), llvm::Twine("False"), foo);
    BasicBlock* True = BasicBlock::Create(getGlobalContext(), llvm::Twine("True"), foo);
    char* pPath;
    pPath = getenv("SOME_ENV_VAR");
    IRBuilder<> builder(Entry);
    Value* envVarDoesntExist = builder.CreateICmpEQ(llvm::StringRef(pPath), Constant::getNullValue(Value), llvm::Twine("temp"));
    //---1
    builder.CreateCondBr(envVarDoesntExist, False, True);
    builder.SetInsertPoint(True);
    builder.CreateCall3(strncpy, bar, llvm::StringRef(pPath), 45, llvm::Twine("temp"));
    //---2
    builder.SetInsertPoint(False);
    builder.CreateCall(printf, llvm::StringRef("Need to set $ENV_Variable.\n"), llvm::Twine("temp"));
    //---1
    return mod;
}
}
char funcP::ID = 0;
static RegisterPass<funcP> X("funcp", "funcP", false, false);
From ---1: How do I convert an llvm::StringRef to a Value*?
From ---2: How do I convert a char* to a Value*?
Could Constant::getNullValue(Value) be used for getting a NULL value?

I intend to write a pass, which when compiled (using LLVM), produces bitcode where the call to strcpy(dest,src) is replaced with strncpy(dest,src,n).
Then what you need to do is locate the call instruction and change it. There's no need to recreate the entire flow; it's already in your source code.
All you need to do is create a function pass, iterate over all the instructions in the function, and, whenever an instruction is a call instruction whose callee is named strcpy, create a new call instruction to your new function and replace the old instruction with the new one (see the sketch below).
Also, there seems to be some fundamental misunderstanding in your code between values in the compiler (such as 45 and all the StringRefs) and values in the code you are processing (instances of one of the subtypes of llvm::Value). Specifically, you can't just use 45 as a parameter to a function in the code you are processing; you have to create a constant int from that number, and then you can use that constant.
One final note: you can implicitly construct a StringRef from a const char*, so you don't need to call the StringRef constructor explicitly all over the place. The same goes for Twine.
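For reference, here is a minimal sketch of such a pass in the legacy-PassManager style. The pass name, the fixed bound of 45, and the registration strings are mine, and the FunctionCallee/getOrInsertFunction usage assumes a reasonably recent LLVM (older releases return a Constant* there and have slightly different CreateCall overloads):
// Sketch of a function pass that rewrites strcpy(dst, src) into
// strncpy(dst, src, N). N is a hard-coded placeholder here; a real pass
// would derive it from the destination buffer size.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"

using namespace llvm;

namespace {
struct StrcpyToStrncpy : public FunctionPass {
  static char ID;
  StrcpyToStrncpy() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override {
    Module *M = F.getParent();
    LLVMContext &Ctx = M->getContext();

    // Collect the calls first; erasing while iterating invalidates iterators.
    SmallVector<CallInst *, 8> Strcpys;
    for (BasicBlock &BB : F)
      for (Instruction &I : BB)
        if (auto *CI = dyn_cast<CallInst>(&I))
          if (Function *Callee = CI->getCalledFunction())
            if (Callee->getName() == "strcpy")
              Strcpys.push_back(CI);

    for (CallInst *CI : Strcpys) {
      Type *I8Ptr = Type::getInt8PtrTy(Ctx);
      Type *SizeTy = Type::getInt64Ty(Ctx); // assuming a 64-bit size_t
      FunctionCallee Strncpy = M->getOrInsertFunction(
          "strncpy", FunctionType::get(I8Ptr, {I8Ptr, I8Ptr, SizeTy}, false));

      IRBuilder<> B(CI); // insert the replacement right before the old call
      Value *Dst = CI->getArgOperand(0);
      Value *Src = CI->getArgOperand(1);
      // This is how "45" becomes a Value usable in the generated code:
      Value *N = ConstantInt::get(SizeTy, 45);
      CallInst *NewCI = B.CreateCall(Strncpy, {Dst, Src, N});

      CI->replaceAllUsesWith(NewCI);
      CI->eraseFromParent();
    }
    return !Strcpys.empty();
  }
};
} // end anonymous namespace

char StrcpyToStrncpy::ID = 0;
static RegisterPass<StrcpyToStrncpy> X("strcpy2strncpy",
                                       "Replace strcpy with strncpy", false, false);
You would then run it through opt with whatever flag you registered (here -strcpy2strncpy) on the bitcode produced from your source file.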

Related

Cannot link custom generated LLVM IR with Clang generated IR

I've been trying to link IR generated with LLVM's C++ API with another IR file generated by Clang++. The input file to Clang contains a function fn that I'm trying to call from the first IR file. But llvm-link doesn't replace fn's declaration with its definition.
main_ir.ll
source_filename = "top"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
@0 = private unnamed_addr constant [5 x i8] c"%d \0A\00", align 1
declare i32 @printf(...)
declare i32 @fn(i32, ...)
define internal i32 @main() {
entrypoint:
%f_call = call i32 (i32, ...) @fn(i32 2)
%printfCall = call i32 (...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @0, i32 0, i32 0), i32 %f_call)
br label %ProgramExit
ProgramExit: ; preds = %entrypoint
ret i32 0
}
fn_ir.ll (generated with Clang)
source_filename = "libDessin.cpp"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
; Function Attrs: noinline nounwind optnone uwtable
define dso_local i32 @_Z2fni(i32) #0 {
%2 = alloca i32, align 4
store i32 %0, i32* %2, align 4
%3 = load i32, i32* %2, align 4
%4 = mul nsw i32 %3, 2
ret i32 %4
}
attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 9.0.1-12 "}
And all llvm-link does is reproduce the contents of fn_ir.ll with the source_filename changed to llvm-link. I'd be really happy to know the bit I'm missing.
The answer is in the name mangling.
Your 'manually' generated IR has a function named fn, while clang++ emits the name _Z2fni.
You need to make the names match. Either emit _Z2fni in main_ir.ll, or (arguably better in this case) change the definition of fn in fn_ir.ll, e.g.:
extern "C" void fn(int x) {
return x * 2;
}
extern "C" tells the compiler to use C mangling convention, this is less fragile since it will work even if you change type or number of arguments of fn. However, it won't work if you want to pass C++ types into the fn, then you need to emit the right function name for the main_ir.ll.
UPD:
There are two more discrepancies:
fn has different argument lists in the two modules: i32 vs. i32, ...
The other issue is that main is declared internal. I guess it is simply stripped, since it is internal and not called by anyone.
So just removing the internal flag should do the job for you.
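If you prefer to keep fn_ir.ll untouched and instead fix the module you generate with the C++ API, the change is just to declare the callee under its mangled name and with the exact non-vararg i32 prototype. A rough sketch, where Builder and TheModule stand for whatever IRBuilder and Module you already have:
// Hypothetical snippet: declare Clang's _Z2fni with a matching, non-vararg
// prototype and call it, so llvm-link can resolve the symbol against fn_ir.ll.
llvm::FunctionType *FnTy = llvm::FunctionType::get(
    Builder.getInt32Ty(), {Builder.getInt32Ty()}, /*isVarArg=*/false);
llvm::FunctionCallee Fn = TheModule->getOrInsertFunction("_Z2fni", FnTy);
llvm::Value *Result = Builder.CreateCall(Fn, {Builder.getInt32(2)});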

Identify annotated variable in an LLVM pass

How can I identify an annotated variable in an LLVM pass?
#include <stdio.h>
int main() {
    int x __attribute__((annotate("my_var"))) = 0;
    int a, b;
    x = x + 1;
    a = 5;
    b = 6;
    x = x + a;
    return x;
}
For example, I want to identify the instructions which have the annotated variable (x in this case) and print them out (x = x+1; and x = x+a)
How can I achieve this?
This is the .ll file generated using LLVM
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64"
@.str = private unnamed_addr constant [7 x i8] c"my_var\00", section "llvm.metadata"
@.str.1 = private unnamed_addr constant [7 x i8] c"test.c\00", section "llvm.metadata"
; Function Attrs: noinline nounwind optnone
define i32 @main() #0 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
%3 = alloca i32, align 4
%4 = alloca i32, align 4
store i32 0, i32* %1, align 4
%5 = bitcast i32* %2 to i8*
call void @llvm.var.annotation(i8* %5, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.s$
store i32 0, i32* %2, align 4
%6 = load i32, i32* %2, align 4
%7 = add nsw i32 %6, 1
store i32 %7, i32* %2, align 4
store i32 5, i32* %3, align 4
store i32 6, i32* %4, align 4
%8 = load i32, i32* %2, align 4
%9 = load i32, i32* %3, align 4
%10 = add nsw i32 %8, %9
store i32 %10, i32* %2, align 4
%11 = load i32, i32* %2, align 4
ret i32 %11
}
; Function Attrs: nounwind
declare void @llvm.var.annotation(i8*, i8*, i8*, i32) #1
attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" $
attributes #1 = { nounwind }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"wchar_size", i32 4}
I recently encountered a similar problem, and searching Google did not turn up a solution.
But in the end, I found the "ollvm" project's Utils.cpp, which solved my problem.
In your case,
%5 = bitcast i32* %2 to i8*
call void @llvm.var.annotation(i8* %5, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.s$
as we can see, there is a call to @llvm.var.annotation. In our pass,
we can loop through the instructions of a function and search for call instructions.
Then get the called function's name:
Function *fn = callInst->getCalledFunction();
StringRef fn_name = fn->getName();
and compare the called function's name with "llvm.var.annotation".
If they match, then we have found the location of "int x" in your case.
The function "llvm.var.annotation" is documented in llvm's doc :
http://llvm.org/docs/LangRef.html#llvm-var-annotation-intrinsic
If you have learn the function "llvm.var.annotation"'s prototype,
then you know that it's second argument is a pointer ,the pointer
points to "my_var\00" in your case . If you thought you can simply
convert it to a GlobalVariable ,then you will failed to get what
you wanted . The actual second argument passed to "llvm.var.annotation"
is
i8* getelementptr inbounds ([7 x i8], [7 x i8]* #.s$
in your case.
It's a expression but a GlobalVariable !!! By knowing this , we can
finally get the annotation of our target variable by :
ConstantExpr *ce = dyn_cast<ConstantExpr>(callInst->getOperand(1));
if (ce) {
    if (ce->getOpcode() == Instruction::GetElementPtr) {
        if (GlobalVariable *annoteStr =
                dyn_cast<GlobalVariable>(ce->getOperand(0))) {
            if (ConstantDataSequential *data =
                    dyn_cast<ConstantDataSequential>(
                        annoteStr->getInitializer())) {
                if (data->isString()) {
                    errs() << "Found data " << data->getAsString();
                }
            }
        }
    }
}
I hope you have already solved the problem.
Have a nice day.
You have to loop over the instructions and identify calls to llvm.var.annotation.
The first argument is a pointer to the annotated variable (i8*).
To get the actual annotated variable, you then need to find what this pointer points to.
In your case, that is the source operand of the bitcast instruction.
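A rough sketch of that approach, assuming the -O0 pattern shown above where the first argument is a bitcast of the variable's alloca (stripPointerCasts handles the bitcast):
// Sketch: for each call to @llvm.var.annotation, recover the annotated alloca
// and print every load/store that touches it.
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static void printAnnotatedUses(Function &F) {
  for (BasicBlock &BB : F)
    for (Instruction &I : BB) {
      auto *CI = dyn_cast<CallInst>(&I);
      if (!CI || !CI->getCalledFunction() ||
          CI->getCalledFunction()->getName() != "llvm.var.annotation")
        continue;
      // Operand 0 is (a bitcast of) the annotated variable.
      Value *Ptr = CI->getArgOperand(0)->stripPointerCasts();
      if (auto *Var = dyn_cast<AllocaInst>(Ptr)) {
        errs() << "annotated variable: " << *Var << "\n";
        for (User *U : Var->users())
          if (isa<LoadInst>(U) || isa<StoreInst>(U))
            errs() << "  use: " << *cast<Instruction>(U) << "\n";
      }
    }
}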

clang-4.0 generates redundant methods when initializing global variables

I'm learning LLVM these days by observing how clang deals with complex situations. I wrote (at the top level, not inside a function):
int qaq = 666;
int tat = 233;
auto hh = qaq + tat;
And I use the command:
clang-4.0 003.cpp -emit-llvm -S -std=c++11
And clang generates codes like this:
@qaq = global i32 666, align 4
@tat = global i32 233, align 4
@hh = global i32 0, align 4
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_003.cpp, i8* null }]
; Function Attrs: noinline uwtable
define internal void @__cxx_global_var_init() #0 section ".text.startup" {
%1 = load i32, i32* @qaq, align 4
%2 = load i32, i32* @tat, align 4
%3 = add nsw i32 %1, %2
store i32 %3, i32* @hh, align 4
ret void
}
; Function Attrs: noinline uwtable
define internal void @_GLOBAL__sub_I_003.cpp() #0 section ".text.startup" {
call void @__cxx_global_var_init()
ret void
}
I'm confused by _GLOBAL__sub_I_003.cpp: why does clang generate a function that only invokes another function (and does nothing else), especially when neither of them takes any parameters?
Disclaimer: this is my interpretation of the logic; I'm not part of the LLVM team.
In order to understand the reasoning behind this, you have to understand a fundamental concept in software engineering: Complexity creates bugs, and makes testing harder.
But first, let's make your example a little more interesting:
int qaq = 666;
int tat = 233;
auto hh = qaq + tat;
auto ii = qaq - tat;
Which leads to:
; Function Attrs: noinline uwtable
define internal void @__cxx_global_var_init() #0 section ".text.startup" !dbg !16 {
%1 = load i32, i32* @qaq, align 4, !dbg !19
%2 = load i32, i32* @tat, align 4, !dbg !20
%3 = add nsw i32 %1, %2, !dbg !21
store i32 %3, i32* @hh, align 4, !dbg !21
ret void, !dbg !20
}
; Function Attrs: noinline uwtable
define internal void @__cxx_global_var_init.1() #0 section ".text.startup" !dbg !22 {
%1 = load i32, i32* @qaq, align 4, !dbg !23
%2 = load i32, i32* @tat, align 4, !dbg !24
%3 = sub nsw i32 %1, %2, !dbg !25
store i32 %3, i32* @ii, align 4, !dbg !25
ret void, !dbg !24
}
; Function Attrs: noinline uwtable
define internal void @_GLOBAL__sub_I_example.cpp() #0 section ".text.startup" !dbg !26 {
call void @__cxx_global_var_init(), !dbg !28
call void @__cxx_global_var_init.1(), !dbg !29
ret void
}
So we see that Clang emits a separate function for each non-trivial initialization and calls each of them, one after the other, from _GLOBAL__sub_I_example.cpp(). That is sensible: things stay neatly organized this way, and could otherwise become a garbled mess in larger or more complicated files.
Notice how that's the exact same logic that is being applied in your example.
Doing otherwise would imply an algorithm of the type: "if there is a single non-trivial global initialization, then put the code directly in the translation unit's global constructor".
Note the following:
The current logic handles that case correctly already.
In optimized code, the end result would be the exact same.
So what would that logic get us, really?
More branches to test.
More opportunities to accidentally introduce a bug.
More code to maintain in the long run.
Removal of a single function call in the global initialization of some translation units in non-optimized builds.
Keeping things the way they are is just the right decision.

Delete complete branch from llvm ir

There is a branch in the IR that I want to delete completely (condition + branch + true basic block + false basic block). It looks like this:
%4 = icmp sge i32 %2, %3
br i1 %4, label %5, label %7
; <label>:5 ; preds = %0
%6 = load i32* %x, align 4
store i32 %6, i32* %z, align 4
br label %9
; <label>:7 ; preds = %0
%8 = load i32* %y, align 4
store i32 %8, i32* %z, align 4
br label %9
; <label>:9 ; preds = %7, %5
%10 = call dereferenceable(140) %"class.std::basic_ostream"* @_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc(%"class.std::basic_ostream"* dereferenceable(140) @_ZSt4cout, i8* getelementptr inbounds ([5 x i8]* @.str, i32 0, i32 0))
%11 = load i32* %z, align 4
%12 = call dereferenceable(140) %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* %10, i32 %11)
%13 = call dereferenceable(140) %"class.std::basic_ostream"* @_ZNSolsEPFRSoS_E(%"class.std::basic_ostream"* %12, %"class.std::basic_ostream"* (%"class.std::basic_ostream"*)* @_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_)
ret i32 0
Now, to delete it: is there a removeBranch function, or do I need to delete the instructions one by one? I have been trying the latter, but I have seen every error from "Basic block in main does not have a terminator" to "use remains when def is destroyed", and many more. I have used eraseFromParent, ReplaceInstWithValue, ReplaceInstWithInst, removeFromParent, etc.
Can anyone be kind enough to point me in the correct direction?
This is my function pass:
bool runOnFunction(Function &F) override {
    for (auto& B : F)
        for (auto& I : B)
            if (auto* brn = dyn_cast<BranchInst>(&I))
                if (brn->isConditional()) {
                    Instruction* cond = dyn_cast<Instruction>(brn->getCondition());
                    if (cond->getOpcode() == Instruction::ICmp) {
                        branch_vector.push_back(brn);
                        //removeConditionalBranch(dyn_cast<BranchInst>(brn));
                    }
                }
    /* For now just delete the branches in the vector. */
    for (auto b : branch_vector)
        removeConditionalBranch(dyn_cast<BranchInst>(b));
    return true;
}
I don't know of any RemoveBranch utility function, but something like this should work. The idea is to delete the branch instruction, then delete anything that becomes dead as a result, and then merge the initial block with the join block.
// for DeleteDeadBlock, MergeBlockIntoPredecessor
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
// for RecursivelyDeleteTriviallyDeadInstructions
#include "llvm/Transforms/Utils/Local.h"
// for IRBuilder
#include "llvm/IR/IRBuilder.h"

void removeConditionalBranch(BranchInst *Branch) {
    assert(Branch &&
           Branch->isConditional() &&
           Branch->getNumSuccessors() == 2);
    BasicBlock *Parent = Branch->getParent();
    BasicBlock *ThenBlock = Branch->getSuccessor(0);
    BasicBlock *ElseBlock = Branch->getSuccessor(1);
    BasicBlock *ThenSuccessor = ThenBlock->getUniqueSuccessor();
    BasicBlock *ElseSuccessor = ElseBlock->getUniqueSuccessor();
    assert(ThenSuccessor && ElseSuccessor && ThenSuccessor == ElseSuccessor);

    // Grab the condition before erasing the branch that uses it.
    Value *Cond = Branch->getCondition();
    Branch->eraseFromParent();
    RecursivelyDeleteTriviallyDeadInstructions(Cond);
    DeleteDeadBlock(ThenBlock);
    DeleteDeadBlock(ElseBlock);
    IRBuilder<> Builder(Parent);
    Builder.CreateBr(ThenSuccessor);
    bool Merged = MergeBlockIntoPredecessor(ThenSuccessor);
    assert(Merged);
}
This code only handles the simple case you've shown, with the then and else blocks both jumping unconditionally to a common join block (it will fail with an assertion error for anything more complicated). More complicated control flow will be a bit trickier to handle, but you should still be able to use this code as a starting point.

Do modern C++ compilers inline functions which are called exactly once?

As in, say my header file is:
class A
{
    void Complicated();
};
And my source file
void A::Complicated()
{
...really long function...
}
Can I split the source file into
void DoInitialStuff(pass necessary vars by ref or value)
{
...
}
void HandleCaseA(pass necessary vars by ref or value)
{
...
}
void HandleCaseB(pass necessary vars by ref or value)
{
...
}
void FinishUp(pass necessary vars by ref or value)
{
...
}
void A::Complicated()
{
...
DoInitialStuff(...);
switch ...
HandleCaseA(...)
HandleCaseB(...)
...
FinishUp(...)
}
Entirely for readability and without any fear of impact in terms of performance?
You should mark the functions static so that the compiler knows they are local to that translation unit.
Without static, the compiler cannot assume (barring LTO / WPA) that the function is only called from this translation unit, so it is less likely to inline it.
Demonstration using the LLVM Try Out page.
That said, code for readability first; micro-optimizations (and such tweaking is a micro-optimization) should only come after performance measurements.
Example:
#include <cstdio>
static void foo(int i) {
    int m = i % 3;
    printf("%d %d", i, m);
}
int main(int argc, char* argv[]) {
    for (int i = 0; i != argc; ++i) {
        foo(i);
    }
}
Produces with static:
; ModuleID = '/tmp/webcompile/_27689_0.bc'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
@.str = private constant [6 x i8] c"%d %d\00" ; <[6 x i8]*> [#uses=1]
define i32 @main(i32 %argc, i8** nocapture %argv) nounwind {
entry:
%cmp4 = icmp eq i32 %argc, 0 ; <i1> [#uses=1]
br i1 %cmp4, label %for.end, label %for.body
for.body: ; preds = %for.body, %entry
%0 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; <i32> [#uses=3]
%rem.i = srem i32 %0, 3 ; <i32> [#uses=1]
%call.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([6 x i8]* @.str, i64 0, i64 0), i32 %0, i32 %rem.i) nounwind ; <i32> [#uses=0]
%inc = add nsw i32 %0, 1 ; <i32> [#uses=2]
%exitcond = icmp eq i32 %inc, %argc ; <i1> [#uses=1]
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
ret i32 0
}
declare i32 @printf(i8* nocapture, ...) nounwind
Without static:
; ModuleID = '/tmp/webcompile/_27859_0.bc'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
@.str = private constant [6 x i8] c"%d %d\00" ; <[6 x i8]*> [#uses=1]
define void @foo(int)(i32 %i) nounwind {
entry:
%rem = srem i32 %i, 3 ; <i32> [#uses=1]
%call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([6 x i8]* @.str, i64 0, i64 0), i32 %i, i32 %rem) ; <i32> [#uses=0]
ret void
}
declare i32 @printf(i8* nocapture, ...) nounwind
define i32 @main(i32 %argc, i8** nocapture %argv) nounwind {
entry:
%cmp4 = icmp eq i32 %argc, 0 ; <i1> [#uses=1]
br i1 %cmp4, label %for.end, label %for.body
for.body: ; preds = %for.body, %entry
%0 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; <i32> [#uses=3]
%rem.i = srem i32 %0, 3 ; <i32> [#uses=1]
%call.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([6 x i8]* @.str, i64 0, i64 0), i32 %0, i32 %rem.i) nounwind ; <i32> [#uses=0]
%inc = add nsw i32 %0, 1 ; <i32> [#uses=2]
%exitcond = icmp eq i32 %inc, %argc ; <i1> [#uses=1]
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
ret i32 0
}
It depends on aliasing (whether pointers to that function are taken) and on function length (a large function inlined into one branch could push the other branch out of the cache, hurting performance).
Let the compiler worry about that, you worry about your code :)
A complicated function is likely to have its speed dominated by the operations within the function; the overhead of a function call won't be noticeable even if it isn't inlined.
You don't have much control over the inlining of a function; the best way to know is to try it and find out.
A compiler's optimizer might be more effective with shorter pieces of code, so you might find it getting faster even if it's not inlined.
If you split up your code into logical groupings, the compiler will do what it deems best: if a piece is short and easy, the compiler should inline it and the result is the same. If, however, the code is complicated, making an extra function call might actually be faster than doing all the work inline, so you leave the compiler the option to do that too. On top of all that, the logically split code can be far easier for a maintainer to grok and helps avoid future bugs.
I suggest you create a helper class to break your complicated function into method calls, much like you were proposing, but without the long, boring and unreadable task of passing arguments to each and every one of these smaller functions. Pass these arguments only once by making them member variables of the helper class.
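For what it's worth, a minimal sketch of that helper-class idea (all names here are illustrative):
// Shared state lives in members, so the split into small steps does not
// require threading long argument lists through every call.
class A {
public:
    void Complicated();
};

namespace {
class ComplicatedImpl {
public:
    explicit ComplicatedImpl(A& owner) : owner_(owner) {}

    void Run() {
        DoInitialStuff();
        HandleCaseA();   // pick the case with a switch in real code
        FinishUp();
    }

private:
    void DoInitialStuff() { /* ... */ }
    void HandleCaseA()    { /* ... */ }
    void FinishUp()       { /* ... */ }

    A& owner_;       // plus whatever locals the steps share
    int scratch_ = 0;
};
} // unnamed namespace

void A::Complicated() {
    ComplicatedImpl(*this).Run();
}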
Don't focus on optimization at this point, make sure your code is readable and you'll be fine 99% of the time.