Find index of string in large file performance - regex

I have a "container" containing data. The size is +- 100MB.
In the container there a several "dataids's" that mark the begin of something.
Now I need to get an index for an given dataid. (dataid for example: '4CFE7197-0029-006B-1AD4-000000000012')
I have tried several approaches. But at this moment "ReadAllBytes" is the most performant.
ReadAll -> average of 0.6 seconds
Using oReader As New BinaryReader(File.Open(sContainerPath, FileMode.Open, FileAccess.Read))
Dim iLength As Integer = CInt(oReader.BaseStream.Length)
Dim oValue As Byte() = Nothing
oValue = oReader.ReadBytes(iLength)
Dim enc As New System.Text.ASCIIEncoding
Dim sFileContent As String = enc.GetString(oValue)
Dim r As Regex = New Regex(sDataId)
Dim lPosArcID As Integer = r.Match(sFileContent).Index
If lPosArcID > 0 Then
Return lPosArcID
End If
End Using
ReadByteByByte -> average of 1.4 seconds
Using oReader As BinaryReader = New BinaryReader(File.Open(sContainerPath, FileMode.Open, FileAccess.Read))
Dim valueSearch As StringSearch = New StringSearch(sDataId)
Dim readByte As Byte
While (InlineAssignHelper(readByte, oReader.ReadByte()) >= 0)
index += 1
If valueSearch.Found(readByte) Then
Return index - iDataIdLength
End If
End While
End Using
Public Class StringSearch
Private ReadOnly oValue() As Byte
Private iValueIndex As Integer = -1
Public Sub New(value As String)
Dim oEncoding As New System.Text.ASCIIEncoding
Me.oValue = oEncoding.GetBytes(value)
End Sub
Public Function Found(oNextByte As Byte) As Boolean
If oValue(iValueIndex + 1) = oNextByte Then
iValueIndex += 1
If iValueIndex + 1 = oValue.Count Then Return True
Else
iValueIndex = -1
End If
Return False
End Function
End Class
Public Function InlineAssignHelper(Of T)(ByRef target As T, ByVal value As T) As T
target = value
Return value
End Function
I find it hard to believe that there is no faster way.
0.6 seconds for a 100MB file is not an acceptable time.
An other approach that I tried, is to split in chuncks of X bytes (100, 1000, ..). But was alot slower.
Any help on an approach I can try?

Related

How to populate a Treeview with multilevel interconnected refs docs

I can't find where I'm going wrong to make the structuring for the levels.
As the attached image is made with a file below it opens until the end. (file basically has single cascade)
The image on the right, on the other hand, contains a level above that has files contained in 3 different levels..
I believe I'm going around a lot to be able to structure according to the files.
structure
Public Class Form1
Public list As New List(Of Linha)
Public Lid As Integer = 0
Dim iPathDoc As String
Private MainSub()
Dim strDocName As String
strDocName = oApprenticeApp.FileManager.GetFullDocumentName(txtFileName.Text, 'Master')
oApprenticeServerDoc = oApprenticeApp.Open(strDocName)
iPathDoc = Microsoft.VisualBasic.Left(oApprenticeServerDoc.FullFileName, InStrRev(oApprenticeServerDoc.FullFileName, '\'))
BuildTreeView()
'Build child nodes
SubNivel()
End Sub
Sub SubNivel()
Dim Final As Boolean = False
'run while finish all list,
'bug ( some times where add a new item at list, its not include on final.. so list is not in sort(how create)
If list.Count <> 0 Then
Do While Final = False
For n = 0 To list.Count - 1
'search each line from list if there reg=false
If list(n).reg = False Then
'if false put = False
Final = False
'if false open
BuildTreeViewRef(list(n).pai, list(n).filho)
Else
'case the item = True
Final = True
End If
Next
Loop
End If
End Sub
Private Sub BuildTreeView()
'set main node
Dim conj As TreeNode = TreeView1.Nodes.Add(oApprenticeServerDoc.FullFileName, oApprenticeServerDoc.DisplayName)
Dim oRefFileDesc As ReferencedFileDescriptor
For Each oRefFileDesc In oApprenticeServerDoc.ReferencedFileDescriptors
If oRefFileDesc.DocumentType = Inventor.DocumentTypeEnum.kAssemblyDocumentObject Then
conj.Nodes.Add(oRefFileDesc.FullFileName, oRefFileDesc.DisplayName, 0, 0)
'add on list each new assy
'creates in order to list = Lid
Lid = Lid + 1
list.Add(New Linha(Lid, oApprenticeServerDoc.FullFileName, oRefFileDesc.FullFileName, False))
Else
conj.Nodes.Add(oRefFileDesc.FullFileName, oRefFileDesc.DisplayName, 1, 1)
End If
Next
End Sub
Private Sub BuildTreeViewRef(Painode As String, refDocName As String)
'open refdoc and local
Dim oRefDoc As String
oRefDoc = oApprenticeApp.FileManager.GetFullDocumentName(refDocName, 'Master')
Dim oAssyDoc As Inventor.ApprenticeServerDocument
oAssyDoc = oApprenticeApp.Open(refDocName)
'set node where include the refDocname, = painode
Dim Root() As TreeNode
Root = TreeView1.Nodes.Find(Painode, True) 'TreeView1.SelectedNode
Dim SubCj() As TreeNode
'check all roots from pai
'If Root.Length > 1 Then
For i = 0 To Root.Length - 1
SubCj = Root(i).Nodes.Find(refDocName, True) 'TreeView1.SelectedNode
Dim oRefFileDesc As ReferencedFileDescriptor
For Each oRefFileDesc In oAssyDoc.ReferencedFileDescriptors
If oRefFileDesc.DocumentType = Inventor.DocumentTypeEnum.kAssemblyDocumentObject Then
SubCj(0).Nodes.Add(oRefFileDesc.FullFileName, oRefFileDesc.DisplayName, 0, 0)
Lid = Lid + 1
list.Add(New Linha(Lid, refDocName, oRefFileDesc.FullFileName, False))
Else
SubCj(0).Nodes.Add(oRefFileDesc.FullFileName, oRefFileDesc.DisplayName, 1, 1)
End If
Next
Next
'check if really the item goes to treeview
For n = 0 To list.Count - 1
If list(n).pai = Painode And list(n).filho = refDocName Then
list(n).reg = True
End If
Next
End Sub
Public Class Linha
Public Property Id() As Integer
Public Property pai() As String
Public Property filho() As String
Public Property reg As Boolean
Public Sub New(ByVal _id As Integer, ByVal _pai As String, ByVal _filho As String, ByVal _reg As Boolean)
Me.Id = _id
Me.pai = _pai
Me.filho = _filho
Me.reg = _reg
End Sub
End Class

Match parts of a string

I have 2 strings that each contain 25 characters. E.g.
X = "0000111111110111111111110"
Y = "0000011111000000000000000"
What would be the most efficient method to identify, true or false if every position that has a "1" string Y also has a "1" in string X? In this example it should return True as there are 1s in X that match the positions of all 1s in Y.
I could read each character position and do a comparison for all 25 but was hoping some clever person would know of a more elegant way.
The easier way is to use Convert.ToInt32() to parse the string as a binary literal and perform binary AND:
Public Function MatchAsBinary(ByVal x As String, ByVal y As String) As Boolean
Dim x_int = Convert.ToInt32(x, 2)
Dim y_int = Convert.ToInt32(y, 2)
Return (x_int And y_int) = y_int
End Function
The faster (~10 times in release build) way is to compare the chars directly:
Public Function MatchAsChars(ByVal x As String, ByVal y As String) As Boolean
For i As Integer = 0 To y.Length - 1
If y(i) = "1"c AndAlso x(i) = "0"c Then
Return False
End If
Next
Return True
End Function
If you regard the strings as binary numbers, you can convert them to numbers and then use the bitwise and operator, like this:
Module Module1
Sub Main()
Dim X = "0000111111110111111111110"
Dim Y = "0000011111000000000000000"
Dim Xb = Convert.ToInt64(X, 2)
Dim Yb = Convert.ToInt64(Y, 2)
Console.WriteLine((Xb And Yb) = Yb)
Console.ReadLine()
End Sub
End Module
That will output True and work for strings of up to 64 characters.
Or, following on from your comment, you could use Convert.ToInt32 as that would give enough bits for your data.
Can do something similar #JoshD said above, but use Convert.ToInt32(Y, 2) to convert from a binary string to an integer.
Xint = Convert.ToInt32(X, 2)
Yint = Convert.ToInt32(Y, 2)
return ((Xint And Yint) = Yint)
This includes what others have shown plus a test for each bit one at a time.
Dim s As String = "0000011111000000000000000"
Dim X As String = "0000111111110111111111110"
Dim Y As String = "0000011111000000000000000"
Dim xi As Integer = Convert.ToInt32(X, 2)
Dim yi As Integer = Convert.ToInt32(Y, 2)
'check each bit
For i As Integer = 0 To 24
Dim msk As Integer = 1 << i
If (msk And xi) = msk AndAlso (msk And yi) = msk Then
Debug.WriteLine("Bit {0} on in both", i)
End If
Next
'all bits
Dim rslt As Integer = xi And yi
s = Convert.ToString(rslt, 2).PadLeft(25, "0"c)
Dim intY As Integer = CInt(Y)
Dim res As Boolean = (CInt(X) And intY) = intY
Convert them to integers, get all instances of matching 1's with a bitwise And, then compare to see if Y was changed by that comparison. If the comparison preserved the original Y, the result will be True.

How to stop second run of the code to prevent override data regex vba?

The below code will split 1 cell into 3 or 4 column based on a pattern of 6chr,5chr,4chr,5+chr. The below also needs to be available on all open workbooks and work from the user selection.
How to fix a bug that after the first splitting of the cell is done and by mistake you run it again will override the data?
Class Module
Option Explicit
'Rename this Class Module cFabric
Private pStyle As String
Private pFabric As String
Private pColour As String
Private pSize As String
Public Property Get Style() As String
Style = pStyle
End Property
Public Property Let Style(Value As String)
pStyle = Value
End Property
Public Property Get Fabric() As String
Fabric = pFabric
End Property
Public Property Let Fabric(Value As String)
pFabric = UCase(Value)
End Property
Public Property Get Colour() As String
Colour = pColour
End Property
Public Property Let Colour(Value As String)
pColour = Value
End Property
Public Property Get Size() As String
Size = pSize
End Property
Public Property Let Size(Value As String)
pSize = Value
End Property
Regular Module
Option Explicit
Sub Fabrics()
Dim wsSrc As Workbook, wsRes As Workbook
Dim vSrc As Variant, vRes As Variant, rRes As Range
Dim RE As Object, MC As Object
Const sPat As String = "^(.{6})\s*(.{5})\s*(.{4})(?:.*1/(\S+))?"
'Group 1 = style
'Group 2 = fabric
'Group 3 = colour
'Group 4 = size
Dim colF As Collection, cF As cFabric
Dim I As Long
Dim S As String
Dim V As Variant
'Set source and results worksheets and ranges
Set wsSrc = ActiveWorkbook
Set wsRes = ActiveWorkbook
Set rRes = wsRes.Application.Selection
'Read source data into array
vSrc = Application.Selection
'Initialize the Collection object
Set colF = New Collection
'Initialize the Regex Object
Set RE = CreateObject("vbscript.regexp")
With RE
.Global = False
.MultiLine = True
.Pattern = sPat
'Test for single cell
If Not IsArray(vSrc) Then
V = vSrc
ReDim vSrc(1 To 1, 1 To 1)
vSrc(1, 1) = V
End If
'iterate through the list
For I = 1 To UBound(vSrc, 1)
S = vSrc(I, 1)
Set cF = New cFabric
If .test(S) = True Then
Set MC = .Execute(S)
With MC(0)
cF.Style = .submatches(0)
cF.Fabric = .submatches(1)
cF.Colour = .submatches(2)
cF.Size = .submatches(3)
End With
Else
cF.Style = S
End If
colF.Add cF
Next I
End With
'create results array
'Exit if no results
If colF.Count = 0 Then Exit Sub
ReDim vRes(1 To colF.Count, 1 To 4)
'Populate the rest
I = 0
For Each V In colF
I = I + 1
With V
vRes(I, 1) = .Style
vRes(I, 2) = .Fabric
vRes(I, 3) = .Colour
vRes(I, 4) = .Size
End With
Next V
'Write the results
Set rRes = rRes.Resize(UBound(vRes, 1), UBound(vRes, 2))
rRes.Value = vRes
End Sub
Credits for the above goes to #Ron Rosenfeld for the project!
One way to tell if the entry has been previously split is as follows
If the regex.test fails, then
If the results line passes, then the item has been previously split
if not, then it is a blank, or a malformed entry
Note that a lot of this could be avoided if you were not overwriting your original data. I would recommend against overwriting your data both for audit and debugging purposes, but the below should help in case you cannot change that.
You just need to make some small changes in the logic where we checked for the malformed entry originally. As well as reading in the "possible" results array into vSrc so that we have the potentially split data to compare:
Option Explicit
Sub Fabrics()
'assume data is in column A
Dim wsSrc As Worksheet, wsRes As Worksheet
Dim vSrc As Variant, vRes As Variant, rRes As Range
Dim RE As Object, MC As Object
Const sPat As String = "^(.{6})\s*(.{5})\s*(.{4})(?:.*1/(\S+))?"
'Group 1 = style
'Group 2 = fabric
'Group 3 = colour
'Group 4 = size
Dim colF As Collection, cF As cFabric
Dim I As Long
Dim S As String
Dim V As Variant
'Set source and results worksheets and ranges
Set wsSrc = ActiveSheet
Set wsRes = ActiveSheet
Set rRes = Selection
'Read source data into array
vSrc = Selection.Resize(columnsize:=4)
'Initialize the Collection object
Set colF = New Collection
'Initialize the Regex Object
Set RE = CreateObject("vbscript.regexp")
With RE
.Global = False
.MultiLine = True
.Pattern = sPat
'iterate through the list
'Test for single cell
If Not IsArray(vSrc) Then
V = vSrc
ReDim vSrc(1 To 1, 1 To 1)
vSrc(1, 1) = V
End If
For I = 1 To UBound(vSrc, 1)
S = vSrc(I, 1)
Set cF = New cFabric
If .test(S) = True Then
Set MC = .Execute(S)
With MC(0)
cF.Style = .submatches(0)
cF.Fabric = .submatches(1)
cF.Colour = .submatches(2)
cF.Size = .submatches(3)
End With
ElseIf .test(vSrc(I, 1) & vSrc(I, 2) & vSrc(I, 3)) = False Then
cF.Style = S
Else
cF.Style = vSrc(I, 1)
cF.Fabric = vSrc(I, 2)
cF.Colour = vSrc(I, 3)
cF.Size = vSrc(I, 4)
End If
colF.Add cF
Next I
End With
'create results array
'Exit if not results
If colF.Count = 0 Then Exit Sub
ReDim vRes(1 To colF.Count, 1 To 4)
'Populate
I = 0
For Each V In colF
I = I + 1
With V
vRes(I, 1) = .Style
vRes(I, 2) = .Fabric
vRes(I, 3) = .Colour
vRes(I, 4) = .Size
End With
Next V
'Write the results
Set rRes = rRes.Resize(UBound(vRes, 1), UBound(vRes, 2))
With rRes
.Clear
.NumberFormat = "#"
.Value = vRes
.EntireColumn.AutoFit
End With
End Sub
Disregarding the previous regex/class method,
Option Explicit
Sub Fabrics_part_Deux()
Dim a As Long, b As Long
With Worksheets("Sheet1")
If .AutoFilterMode Then .AutoFilterMode = False
With .Range(.Cells(1, "A"), .Cells(.Rows.Count, "B").End(xlUp).Offset(0, 3))
With .Columns("B")
.Offset(1, 0).Replace what:=Chr(32), replacement:=vbNullString, lookat:=xlPart
End With
.AutoFilter field:=2, Criteria1:="<>"
.AutoFilter field:=3, Criteria1:=""
With .Resize(.Rows.Count - 1, 1).Offset(1, 1)
If CBool(Application.Subtotal(103, .Cells)) Then
With .SpecialCells(xlCellTypeVisible)
For a = 1 To .Areas.Count
With .Areas(a).Cells
.TextToColumns Destination:=.Cells(1), DataType:=xlFixedWidth, _
FieldInfo:=Array(Array(0, 1), Array(6, 1), Array(11, 1), Array(15, 2))
For b = 1 To .Rows.Count
.Cells(b, 2) = UCase$(.Cells(b, 2).Value2)
If CBool(InStr(1, .Cells(b, 4).Value2, Chr(47), vbBinaryCompare)) Then
.Cells(b, 4) = Trim(Split(.Cells(b, 4), Chr(47))(1))
End If
Next b
End With
Next a
End With
End If
End With
End With
If .AutoFilterMode Then .AutoFilterMode = False
End With
End Sub
In your code to output to the spreadsheet, you need to check for empty strings
I = 0
For Each V In colF
I = I + 1
With V
vRes(I, 1) = .Style
If len(.Fabric) > 0 then
vRes(I, 2) = .Fabric
vRes(I, 3) = .Colour
vRes(I, 4) = .Size
End If
End With
Next V

VBA Excel DLL Argument Issue - 6th Arg

so I've got this VBA code that calls DLL code. The DLL code works fine, the VBA code works fine UNTIL I go to call the DLL function from the VBA. For some reason it's not passing the 6th argument correctly. I tested by adding a 7th argument and passing the same value in the 6th and 7th arguments - the 7th passes fine, the 6th passes the same large (incorrect) value. I have no clue what is going on.
VBA:
Option Explicit
' Declare the LMM Function that's in the DLL
Declare PtrSafe Function GenCudaLMMPaths Lib "C:\Path to DLL\LMMExcel.dll" Alias "GenerateCUDALMMPaths" (xTimes#, xRates#, xVols#, xRData#, ByRef ArrLen As Long, ByRef NPaths As Long) As Long
' Generate LMM Paths on Click
Sub LMM_Click()
Dim Times#(), Rates#(), Vols#()
Dim x As Long
Dim y As Long
Dim rTimes As Range
Dim rRates As Range
Dim rVols As Range
Dim cell As Range
Dim sz As Long
sz = 15
' Resize
ReDim Times(sz), Rates(sz), Vols(sz)
' Fill in Data
Set rTimes = Sheets("Market").Range("C2:Q2")
x = 1
For Each cell In rTimes
Times(x) = cell.Value
x = x + 1
Next
Set rRates = Sheets("Market").Range("C5:Q5")
x = 1
For Each cell In rRates
Rates(x) = cell.Value
x = x + 1
Next
Set rVols = Sheets("Market").Range("C4:Q4")
x = 1
For Each cell In rVols
Vols(x) = cell.Value / 10000
x = x + 1
Next
'Call the Function
Dim np As Long
np = Sheets("LMM").Range("C2").Value
Dim useCuda As Boolean
If Sheets("LMM").Range("C3").Value = "GPU" Then
useCuda = True
Else
useCuda = False
End If
Dim rData#()
Dim rValue
ReDim rData(np * sz * (sz + 3))
rValue = GenCudaLMMPaths(Times(1), Rates(1), Vols(1), rData(1), sz, np)
If rValue = -1 Then
'No CUDA Card
MsgBox ("Your system doesn't have a CUDA Enabled GPU")
ElseIf rValue = 1 Then
'Error Occurred
MsgBox ("An error occurred while trying to generate LMM paths")
ElseIf rValue = 0 Then
'Success
' Need to reformat return data
Dim fmtData()
ReDim fmtData(np * sz, sz)
Dim i, j, k
For i = 0 To np - 1
For j = 0 To np - 1
For k = 0 To np - 1
fmtData(((i * sz) + j) + 1, k + 1) = rData(((i * sz * sz) + (j * sz) + k) + 1)
Next k
Next j
Next i
'Fill in data
Sheets("LMM").Range("A8:K" & (np * sz)) = fmtData
Else
'Too many requested paths for this CUDA card
MsgBox ("In order to prevent GPU Lock-up, you cannot request more than " & rValue & " paths.")
Sheets("LMM").Range("C2").Value = rValue
End If
End Sub
DLL Function Declaration:
int __stdcall GenerateCUDALMMPaths(double* arrTimes, double* arrRates, double* arrVols, double* retData, int& ArrLength, int& NPaths);
DEF File:
LIBRARY "CUDAFinance"
EXPORTS
CheckExcelArray = CheckExcelArray
GenerateLMMPaths = GenerateLMMPaths
GenerateCUDALMMPaths = GenerateCUDALMMPaths
Anyone have any idea here? I'm completely lost.
I just run into the same problem and got it solved as follows.
Since you already have a long variable in the six arguments function, import the NPaths together with Arrlen as an array without adding a 7th argument:
1) In VBA:
Declare a two elements array:
Dim NArrLenNPaths(1) as long
Then, assign values:
NArrLenNPaths(0) contains ArrLen and NArrLenNPaths(1) the NPaths value.
Keep the function delcaration in VBA but when calling it put NArrLenNPaths(0) as 6th argument. Do not put a 7th argument. The C++ will retreive both values as follows.
2) In C++ use a pointer instead:
Change the 6th argument to
int* NArrLenNPaths
then retreive the values by
int NArrLen = NArrLenNPaths[0];
int NPaths = NArrLenNPaths[1];

XNPV calculation in C# or VB without using excel

Has anyone developed a financial function XNPV? I am looking for code that will calculate this value. I am not looking to call excel to calculate the value.
Any help would be great, thanks
I believe this is the correct implementation in c#:
public double XNPV(decimal[] receipts,DateTime[] dates, double dRate, DateTime issueDate, decimal cf, int x)
{
double sum;
for(int i =0;i<dates.Length;i++)
{
TimeSpan ts = dates[i].Subtract(issuedate);
sum +=receipts[i] / ((1 + dRate / cf) ^ ((ts.TotalDays / 360) * cf));
}
return sum;
}
and this is the equivalent in VB:
Public Function XNPV(ByVal Receipts() As Decimal, ByVal Dates() As Date, ByVal dRate As Double, ByVal IssueDate As Date, ByVal CF As Decimal, ByVal x As Integer) As Double
Dim i As Integer
Dim sum As Double
For i = 0 To UBound(Dates)
sum = sum + Receipts(i) / ((1 + dRate / CF) ^ ((DateDiff / 360) * CF))
XNPV = sum
End Function
I adapted the code above and included an example of how to call it. It should be easy to adapt to use a variable discount rate instead of .1
Public Function XNPV(ByVal Receipts() As Decimal, ByVal Dates() As Date, ByVal dRate As Double, ByVal StartDate As Date) As Double
Dim i As Integer = 0
Dim dXNPV As Double = 0
Dim dXRate As Double = 1 + dRate
Dim dDayOfCF As Long = 0
Dim dAdj As Double = 0
Dim dResult As Double = 0
For i = 0 To UBound(Dates)
dDayOfCF = (DateDiff(DateInterval.Day, StartDate, Dates(i)))
dAdj = (dDayOfCF / 365)
dXNPV = Receipts(i) / (dXRate ^ dAdj)
dResult += dXNPV
Next
XNPV = dResult
End Function
Private Sub SimpleButton2_Click(sender As System.Object, e As System.EventArgs) Handles SimpleButton2.Click
Dim tblReceipts(2) As Decimal
Dim tblDates(2) As DateTime
Dim dtStartDate As DateTime = "1/1/13"
Dim XNPVResult As Decimal = 0
tblReceipts(0) = -100000
tblReceipts(1) = 500000
tblReceipts(2) = 2000000
tblDates(0) = "1/1/13"
tblDates(1) = "1/7/13"
tblDates(2) = "1/1/15"
'xCF = 1 + DateDiff(DateInterval.Day, dtIssueDate, tblDates(2))
XNPVResult = XNPV(tblReceipts, tblDates, 0.1, dtStartDate)
MsgBox(XNPVResult)
End Sub
Returns value same as Excels' XNPV (double-checked)
public static double XNPV(double rate, List<xnpvFlow> Cashflows, DateTime? StartDate = null)
{
if (Cashflows == null || Cashflows.Count == 0) return 0;
if(StartDate == null)
{
StartDate = (from xnpvFlow flow in Cashflows select flow.FlowDate).Min();
}
double _xnpv = 0;
foreach(xnpvFlow flow in Cashflows)
{
_xnpv += (double)flow.FlowAmount / Math.Pow((1 + rate), (double)(flow.FlowDate - (DateTime)StartDate).Days/365);
}
return _xnpv;
}
public class xnpvFlow
{
public xnpvFlow(DateTime _FlowDate, decimal _flowAmount)
{
this.FlowAmount = _flowAmount;
this.FlowDate = _FlowDate;
}
public DateTime FlowDate;
public decimal FlowAmount;
}