General Purpose UDFs for using Regular Expressions in Excel - regex

I need to parse and summarize and batches of several thousand text lines on a weekly basis. Excel wildcards weren't flexible enough, and I wanted to remove the extra step of either pasting into Notepad++ for processing or feeding to a script.
Here are the tools I came up with. They're still a bit slow -- perhaps 3000 lines per second on a company laptop -- but they are handy.
RXMatch -- return first match, option to return a subgroup.
=RXMatch("Apple","A(..)",1) -> "pp"
RXCount -- count number of matches
=RXCount("Apple","p") -> 2
RXPrint -- embed first match and/or subgroups into a template string
=RXPrint("Apple","(\S)\S+","\1 is for \0") -> "A is for Apple"
RXPrintAll -- embed each match into a template string, join the results
=RXPrintAll("Apple Banana","(\S)\S+","\1 is for \0") -> "A is for Apple, B is for Banana"
RXMatches -- return a vertical array of matches, option to return a subgroup
=RXMatches("Apple Banana","\S+") -> {"Apple";"Banana"}

RXMatch
Public Function RXMatch(Text As String, Pattern As String, Optional Group As Integer = 0, Optional IgnoreCase As Boolean = True) As String
Dim retval As String
' Takes a string and returns the matching text
' Text is the string to be searched
' Pattern is the regex pattern
' Group (optional) selects a parenthesized group (count the number of left parentheses preceding it to get the group number)
' IgnoreCase (optional) set to False for a case-sensitive search
Dim RE As Object
Dim Matches As Object
Set RE = CreateObject("vbscript.regexp")
RE.IgnoreCase = IgnoreCase
RE.Pattern = Pattern
Set Matches = RE.Execute(Text)
If (Matches.Count > 0) Then
If (Group > 0) Then
retval = Matches(0).submatches(Group - 1)
Else
retval = Matches(0)
End If
Else
retval = ""
End If
RXMatch = retval
End Function
RXCount
Public Function RXCount(Text As String, Pattern As String, Optional IgnoreCase As Boolean = True) As Integer
Dim retval As Integer
' Counts the number of matches
' Text is the string to be searched
' Pattern is the regex pattern
' IgnoreCase (optional) set to False for a case-sensitive search
Dim RE As Object
Dim Matches As Object
Set RE = CreateObject("vbscript.regexp")
RE.IgnoreCase = IgnoreCase
RE.Global = True
RE.Pattern = Pattern
Set Matches = RE.Execute(Text)
retval = Matches.Count
RXCount = retval
End Function
RXPrint
Public Function RXPrint(Text As String, Pattern As String, Optional Template As String = "\0", Optional IgnoreCase As Boolean = True) As String
Dim retval As String
' Takes a string and returns a new string formatted according to the given template, using the first match found
' Text is the string to be searched
' Pattern is the regex pattern
' Template (optional) is a string which should contain group identifiers (\0 - \9) to be substituted with groups in the match
' IgnoreCase (optional) set to False for a case-sensitive search
Dim REText, RETemplate As Object
Dim MatchesText, MatchesTemplate As Object
Set REText = CreateObject("vbscript.regexp")
REText.IgnoreCase = IgnoreCase
REText.Pattern = Pattern
Set MatchesText = REText.Execute(Text)
Set RETemplate = CreateObject("vbscript.regexp")
RETemplate.Global = True
RETemplate.Pattern = "(?:\\(.))|([^\\]+)"
Set MatchesTemplate = RETemplate.Execute(Template)
If (MatchesText.Count > 0) Then
ReDim retArray(0 To MatchesTemplate.Count - 1) As String
Dim escaped As String
Dim plaintext As String
For i = 0 To MatchesTemplate.Count - 1
escaped = MatchesTemplate(i).submatches(0)
plaintext = MatchesTemplate(i).submatches(1)
If (Len(escaped) > 0) Then
If (IsNumeric(escaped)) Then
Dim groupnum As Integer
groupnum = CInt(escaped)
If groupnum = 0 Then
retArray(i) = MatchesText(0)
ElseIf (groupnum > MatchesText(0).submatches.Count) Then
retArray(i) = "?"
Else
retArray(i) = MatchesText(0).submatches(groupnum - 1)
End If
Else
retArray(i) = escaped
End If
Else
retArray(i) = plaintext
End If
Next i
retval = Join(retArray, "")
Else
retval = ""
End If
RXPrint = retval
End Function
RXPrintAll
Public Function RXPrintAll(Text As String, Pattern As String, Optional Template As String = "\0", Optional Delimiter As String = ", ", Optional IgnoreCase As Boolean = True) As String
Dim retval As String
' Takes a string and returns a new string formatted according to the given template, repeated for each match
' Text is the string to be searched
' Pattern is the regex pattern
' Template (optional) is a string which should contain group identifiers (\0 - \9) to be substituted with groups in the match
' Delimiter (optional) specified how the results will be joined
' IgnoreCase (optional) set to False for a case-sensitive search
Dim REText, RETemplate As Object
Dim MatchesText, MatchesTemplate As Object
Set REText = CreateObject("vbscript.regexp")
REText.IgnoreCase = IgnoreCase
REText.Global = True
REText.Pattern = Pattern
Set MatchesText = REText.Execute(Text)
Set RETemplate = CreateObject("vbscript.regexp")
RETemplate.Global = True
RETemplate.Pattern = "(?:\\(.))|([^\\]+)"
Set MatchesTemplate = RETemplate.Execute(Template)
If (MatchesText.Count > 0) Then
ReDim retArrays(0 To MatchesText.Count - 1)
For j = 0 To MatchesText.Count - 1
ReDim retArray(0 To MatchesTemplate.Count - 1) As String
Dim escaped As String
Dim plaintext As String
For i = 0 To MatchesTemplate.Count - 1
escaped = MatchesTemplate(i).submatches(0)
plaintext = MatchesTemplate(i).submatches(1)
If (Len(escaped) > 0) Then
If (IsNumeric(escaped)) Then
Dim groupnum As Integer
groupnum = CInt(escaped)
If groupnum = 0 Then
retArray(i) = MatchesText(j)
ElseIf (groupnum > MatchesText(j).submatches.Count) Then
retArray(i) = "?"
Else
retArray(i) = MatchesText(j).submatches(groupnum - 1)
End If
Else
retArray(i) = escaped
End If
Else
retArray(i) = plaintext
End If
Next i
retArrays(j) = Join(retArray, "")
Next j
retval = Join(retArrays, Delimiter)
Else
retval = ""
End If
RXPrintAll = retval
End Function
RXMatches
Public Function RXMatches(Text As String, Pattern As String, Optional Group As Integer = 0, Optional IgnoreCase As Boolean = True) As Variant
Dim retval() As String
' Takes a string and returns all matches in a vertical array
' Text is the string to be searched
' Pattern is the regex pattern
' Group (optional) selects a parenthesized group (count the number of left parentheses preceding it to get the group number)
' IgnoreCase (optional) set to False for a case-sensitive search
Dim RE As Object
Dim Matches As Object
Set RE = CreateObject("vbscript.regexp")
RE.IgnoreCase = IgnoreCase
RE.Global = True
RE.Pattern = Pattern
Set Matches = RE.Execute(Text)
If (Matches.Count > 0) Then
ReDim retval(0 To Matches.Count - 1)
For i = 0 To Matches.Count - 1
If (Group > 0) Then
retval(i) = Matches(i).submatches(Group - 1)
Else
retval(i) = Matches(i)
End If
Next i
Else
ReDim retval(1)
retval(0) = ""
End If
RXMatches = Application.Transpose(retval)
End Function

When dealing with UDFs it's vital that you cache created objects.
For example:
Public Function RegexTest(ByVal vHaystack As Variant, ByVal sPattern As String, Optional ByVal sFlags As String = "") As Boolean
'If haystack is an error then return false
If IsError(vHaystack) Then Exit Function
'Stringify haystack
Dim sHaystack As String: sHaystack = vHaystack
'Cache regular expressions, especially important for formulae
Static lookup As Object
If lookup Is Nothing Then Set lookup = CreateObject("Scripting.Dictionary")
'If cached object doesn't exist, create it
Dim sKey As String: sKey = sPattern & "-" & sFlags
If Not lookup.exists(sKey) Then
'Create regex object
Set lookup(sKey) = CreateObject("VBScript.Regexp")
'Bind flags
For i = 1 To Len(sFlags)
Select Case Mid(sFlags, i, 1)
Case "i"
lookup(sKey).IgnoreCase = True
Case "g"
lookup(sKey).Global = True
End Select
Next
'Set pattern
lookup(sKey).Pattern = sPattern
End If
'Use test function of regex object
RegexTest = lookup(sKey).test(sHaystack)
End Function
Applying this to your own functions, you'll see this vastly increases the speed of execution on a large number of cells.

Related

How to save SubMatches as array and print not empty submatches?

When I try the following Regex code and add a "Add Watch" (Shift + F9) to Matches
Sub TestRegEx1()
Dim regex As Object, Matches As Object
Dim str As String
str = "This is text for the submatches"
Set regex = CreateObject("VBScript.RegExp")
regex.Pattern = "Th(is).+(for).+(submatches)|.+(\d)|([A-Z]{3})"
regex.IgnoreCase = True
Set Matches = regex.Execute(str)
End Sub
I see that Matches is structured like this (with 2 empty submatches):
2 questions:
How can I save in an array variable the SubMatches?
How can I Debug.Print only elements that are not empty?
I've tried doing like below but is not working
Set Arr = Matches.SubMatches
Set Arr = Matches(1).SubMatches
Set Arr = Matches.Item(1).SubMatches
Thanks in advance
Is the following what you intended? Oversize an array at the start and redim at the end. First version prints only non-empty but stores all. Second version prints and stores only non-empty.
You probably want to .Test to ensure there are matches.
Option Explicit
Sub TestRegEx1()
Dim regex As Object, matches As Object, match As Object, subMatch As Variant
Dim str As String, subMatches(), i As Long
ReDim subMatches(0 To 1000)
str = "This is text for the submatches"
Set regex = CreateObject("VBScript.RegExp")
regex.Pattern = "Th(is).+(for).+(submatches)|.+(\d)|([A-Z]{3})"
regex.IgnoreCase = True
Set matches = regex.Execute(str)
For Each match In matches
For Each subMatch In match.subMatches
subMatches(i) = match.subMatches(i)
If Not IsEmpty(subMatches(i)) Then Debug.Print subMatches(i)
i = i + 1
Next
Next
ReDim Preserve subMatches(0 To i)
End Sub
If you only want to store non-empty then
Option Explicit
Sub TestRegEx1()
Dim regex As Object, matches As Object, match As Object, subMatch As Variant
Dim str As String, subMatches(), i As Long
ReDim subMatches(0 To 1000)
str = "This is text for the submatches"
Set regex = CreateObject("VBScript.RegExp")
regex.Pattern = "Th(is).+(for).+(submatches)|.+(\d)|([A-Z]{3})"
regex.IgnoreCase = True
Set matches = regex.Execute(str)
For Each match In matches
For Each subMatch In match.subMatches
subMatches(i) = match.subMatches(i)
If Not IsEmpty(subMatches(i)) Then
Debug.Print subMatches(i)
i = i + 1
End If
Next
Next
ReDim Preserve subMatches(0 To i)
End Sub
You may use a Collection and fill it on the go.
Add
Dim m, coll As Collection
Initialize the collection:
Set coll = New Collection
Then, once you get the matches, use
If Matches.Count > 0 Then ' if there are matches
For Each m In Matches(0).SubMatches ' you need the first match submatches
If Len(m) > 0 Then coll.Add (m) ' if not 0 length, save value to collection
Next
End If
Result of the code with changes:

Match all dates without an asterisk in front

I'm trying to use negative lookback to match all dates without an asterisk in front but it doesn't seem to be working.
(?<!\\*)(\b(?:0[1-9]|[0-2])/(?:0[1-9]|[12]\d|3[01])/(?:19\d{2}|[2-9]\d{3})\b)
This is the string I'm trying to match:
02/02/2019 *03/20/2019 AB CART 9000341 FAXED TO INSTITUTION
Here's the full code for what I have. It extracts the most recent date preceding the word faxed. The problem is if there is a date with an asterisk in front of it (such as *03/20/2019) it chooses that instead of the date (02/02/2019)
This is the Function:
Option Explicit
Function lastFaxedDt(s As String) As Date
Dim re As RegExp, MC As MatchCollection
Const sPat As String = "(\b(?:0[1-9]|1[0-2])/(?:0[1-9]|[12]\d|3[01])/(?:19\d{2}|[2-9]\d{3})\b)(?=.*?faxed)"
Set re = New RegExp
With re
.Pattern = sPat
.IgnoreCase = True
.Global = True
If .Test(s) = True Then
Set MC = .Execute(s)
lastFaxedDt = CDate(MC(MC.Count - 1))
End If
End With
End Function
This is the Macro:
Sub ExtractDate()
marker = 0
Set objShell = CreateObject("Shell.Application")
IE_count = objShell.Windows.Count
For x = 0 To (IE_count - 1)
On Error Resume Next
my_url = objShell.Windows(x).document.Location
my_title = objShell.Windows(x).document.Title
If my_title Like "RFT" & "*" Then
Set IE = objShell.Windows(x)
marker = 1
Exit For
Else
End If
Next
Dim Text As String
Text = Trim$(IE.document.getElementById("ctl00_ContentPlaceHolder1_txtNotes").innerText)
ExtractedDate = lastFaxedDt(Text)
If ExtractedDate = "12:00:00 AM" Then
ExtractedDate = "0"
Else
End If
ExtractedDate = CLng(ExtractedDate)
MaxDate = Application.WorksheetFunction.Max(ExtractedDate)
If MaxDate = "0" Then
MsgBox "No Date Found"
Else
End If
MaxDate = CDate(MaxDate)
Dim ws5 As Worksheet: Set ws5 = ActiveWorkbook.ActiveSheet
ws5.Range("C" & (ActiveCell.Row)).Value = MaxDate
Range("C" & (ActiveCell.Row)).NumberFormat = "[$-409]d-mmm;#"
End Sub
As mentioned in the comments, VBA does not support Lookbehinds. To work around this, you can replace your Lookbehind with the following:
(?:^|[^*])
And then find the date in the capturing group (sub-match) instead of the full match. In this case, your function should look something like this:
Function lastFaxedDt(s As String) As Date
Const sPat As String = _
"(?:^|[^*])" & _
"(\b(?:0[1-9]|1[0-2])/(?:0[1-9]|[12]\d|3[01])/(?:19\d{2}|[2-9]\d{3})\b)" & _
"(?=.*?faxed)"
Dim re As New RegExp, matches As MatchCollection
With re
.Pattern = sPat
.IgnoreCase = True
.Global = True
Set matches = .Execute(s)
If matches.Count > 0 Then
Dim lastMatch As Match: Set lastMatch = matches(matches.Count - 1)
lastFaxedDt = CDate(lastMatch.SubMatches.Item(0))
Else
' TODO: handle the case where no matches are found
End If
End With
End Function
Usage:
Dim s As String
s = "02/02/2019 *03/20/2019 AB CART 9000341 FAXED TO INSTITUTION"
MsgBox lastFaxedDt(s) ' 02/02/2019

Excel VBA RegEx that extracts numbers from price values in range (has commas, $ and -)

I have a field data extracted from a database which represents a range of values, but it's coming in Excel as a String format $86,000 - $162,000.
I need to extract the minimum value and the maximum value from each cell, so I need to extract the numeric portion of it, and ignore the $, - and the ,.
I've attached an image of the data I have, and the values I want to extract from it.
This is the closest pattern I got with RegEx, but I'ts not what I'm looking for.
Pattern = (\d+)(?:\.(\d{1,2}))?
Can anyone assist ?
Just wondering why Regex?
Function GetParts(priceRange As String) As Double()
Dim arr() As String
Dim parts() As Double
If InStr(1, priceRange, "-") > 0 Then
arr = Split(priceRange, "-")
ReDim parts(0 To UBound(arr))
Dim i As Long
For i = 0 To UBound(arr)
parts(i) = CDbl(Replace$(Replace$(Trim$(arr(i)), "$", ""), ",", ""))
Next i
End If
GetParts = parts
End Function
Sub test()
MsgBox GetParts("$14,000 - $1,234,567")(0) 'Minimum
End Sub
EDIT
Yet you could do this with regex to match the data string into the parts:
Function GetPartsRegEx(priceRange As String) As Variant
Dim arr() As Double
Dim pricePattern As String
pricePattern = "(\$?\d+[\,\.\d]*)"
'START EDIT
Static re As RegExp
If re Is Nothing Then
Set re = New RegExp
re.IgnoreCase = True
re.Global = True
re.Pattern = pricePattern & "\s*[\-]\s*" & pricePattern 'look for the pattern first
End If
Static nums As RegExp
If nums Is Nothing Then
Set nums = New RegExp
'to remove all non digits, except decimal point in case you have pennies
nums.Pattern = "[^0-9.]"
nums.Global = True
End If
'END EDIT
If re.test(priceRange) Then
ReDim arr(0 To 1) ' fill return array
arr(0) = CDbl(nums.Replace(re.Replace(priceRange, "$1"), ""))
arr(1) = CDbl(nums.Replace(re.Replace(priceRange, "$2"), ""))
Else
'do some error handling here
Exit Function
End If 'maybe throw error if no +ve test or
GetPartsRegEx = arr
End Function
Sub test()
MsgBox GetPartsRegEx("$1,005.45 - $1,234,567.88")(1)
End Sub
Here is quick Example Demo https://regex101.com/r/RTNlVF/1
Pattern "^\$(\d+\,\d+)\s\-\s\$(\d+\,\d+)"
Option Explicit
Private Sub Example()
Dim RegExp As New RegExp
Dim Pattern As String
Dim CelValue As String
Dim rng As Range
Dim Cel As Range
Set rng = ActiveWorkbook.Sheets("Sheet1" _
).Range("A2", Range("A9999" _
).End(xlUp))
For Each Cel In rng
DoEvents
Pattern = "^\$(\d+\,\d+)\s\-\s\$(\d+\,\d+)"
If Pattern <> "" Then
With RegExp
.Global = True
.MultiLine = True
.IgnoreCase = False
.Pattern = Pattern
End With
If RegExp.Test(Cel.Value) Then
' Debug.Print Cel.Value
Debug.Print RegExp.Replace(CStr(Cel), "$1")
Debug.Print RegExp.Replace(CStr(Cel), "$2")
End If
End If
Next
End Sub
Without a loop (but still no regex):
Sub Split()
With Columns("B:B")
.Replace What:="$", Replacement:=""
Application.CutCopyMode = False
.TextToColumns Destination:=Range("B1"), DataType:=xlDelimited, Other:=True, OtherChar:="-", FieldInfo:=Array(Array(1, 1), Array(2, 1))
End With
Columns("B:C").Insert Shift:=xlToRight
Columns("D:E").NumberFormat = "0"
Range("D1").FormulaR1C1 = "Min Value"
Range("E1").FormulaR1C1 = "Max Value"
With Range("D1:E1").Interior
.Pattern = xlSolid
.PatternColorIndex = xlAutomatic
.Color = 12611584
End With
With Range("D1:E1").Font
.ThemeColor = xlThemeColorDark1
.TintAndShade = 0
End With
End Sub
I made this function:
Hope it helps.
Code:
Function ExtractNumber(ByVal TextInput As String, _
Optional ByVal Position As Byte = 0, _
Optional ByVal Delimiter As String = "-") As Variant
' You can use this function in a subprocess that
' writes the values in the cells you want, or
' you can use it directly in the ouput cells
' Variables
Dim RemoveItems(2) As String
Dim Aux As Variant
' The variable RemoveItems is an array
' containing the characters you want to remove
RemoveItems(0) = "."
RemoveItems(1) = ","
RemoveItems(2) = " "
' STEP 1 - The variable Aux will store the text
' given as input
Aux = TextInput
' STEP 2 - Characters stored in the variable
' RemoveItems will be removed from Aux
For i = 0 To UBound(RemoveItems)
Aux = Replace(Aux, RemoveItems(i), "")
Next i
' STEP 3 - Once Aux is "clean", it will be
' transformed into an array containing the
' values separated by the delimiter
' As you can see at the function's header,
' Delimiter default value is "-". You can change
' it depending on the situation
Aux = Split(Aux, Delimiter)
' STEP 4 - The result of this function will be
' a numeric value. So, if the value of the
' selected position in Aux is not numeric it will
' remove the first character assuming it is a
' currency symbol.
' If something fails in the process the function
' will return "ERROR", so you can know you may
' verify the inputs or adjust this code for
' your needs.
On Error GoTo ErrHndl
If Not IsNumeric(Aux(Position)) Then
ExtractNumber = CLng(Mid(Aux(Position), 2))
Else
ExtractNumber = CLng(Aux(Position))
End If
Exit Function
ErrHndl:
ExtractNumber = "ERROR"
End Function
You can even do this with just worksheet formulas. Under certain circumstances, Excel will ignore the $ and ,. The double unary converts the returned string to a numeric value.
First Value: =--LEFT(A1,FIND("-",A1)-1)
Second Value: =--MID(A1,FIND("-",A1)+1,99)

Excel regex match and return multiple references in formula

I've created a function that will return the Nth reference which includes a sheetname (if it's there), however it's not working for all instances. The regex string I'm using is
'[\w ]+[']!([$]{0,1})([A-Z]{1,2})([$]{0,1})(\d{1,5})
I'm finding though it won't find the first reference in either of the below examples:
='Biscuits Raw Data'!G783/'Biscuits Raw Data'!E783
=IF('Biscuits Raw Data'!G705="","",'Biscuits Raw Data'!G723/'Biscuits Raw Data'!G7005*100)
Below is my Function code:
Function GrabNthreference(Rng As range, NthRef As Integer) As String
Dim patrn As String
Dim RegX
Dim Matchs
Dim RegEx
Dim FinalMatch
Dim Subm
Dim i As Integer
Dim StrRef As String
patrn = "'[\w ]+[']!([$]{0,1})([A-Z]{1,2})([$]{0,1})(\d{1,5})"
StrRef = Rng.Formula
Set RegEx = CreateObject("vbscript.regexp") ' Create regular expression.
RegEx.Global = True
RegEx.Pattern = patrn ' Set pattern.
RegEx.IgnoreCase = True ' Make case insensitive.
Set RegX = RegEx.Execute(StrRef)
If RegX.Count < NthRef Then
GrabNthreference = StrRef
Exit Function
End If
i= -1
For Each Matchs In RegX ' Iterate Matches collection.
Set Subm = RegX(i).submatches
i = i + 1
If i = NthRef -1 Then
GrabNthreference = RegX(i)
Exit Function
End If
'Debug.Print RegX(i)
Next
End Function
Here's my final code
Function GrabNthreference(R As range, NthRef As Integer) As String 'based on http://stackoverflow.com/questions/13835466/find-all-used-references-in-excel-formula
Dim result As Object
Dim testExpression As String
Dim objRegEx As Object
Dim i As Integer
i = 0
Set objRegEx = CreateObject("VBScript.RegExp")
objRegEx.IgnoreCase = True
objRegEx.Global = True
objRegEx.Pattern = """.*?""" ' remove expressions
testExpression = CStr(R.Formula)
testExpression = objRegEx.Replace(testExpression, "")
'objRegEx.Pattern = "(([A-Z])+(\d)+)" 'grab the address think this is an old attempt so remming out
objRegEx.Pattern = "(['].*?['!])?([[A-Z0-9_]+[!])?(\$?[A-Z]+\$?(\d)+(:\$?[A-Z]+\$?(\d)+)?|\$?[A-Z]+:\$?[A-Z]+|(\$?[A-Z]+\$?(\d)+))"
If objRegEx.Test(testExpression) Then
Set result = objRegEx.Execute(testExpression)
If result.Count > 0 Then
For Each Match In result
Debug.Print Match.Value
If i = NthRef - 1 Then
GrabNthreference = result(i)
Exit Function
End If
i = i + 1
Next Match
Else
GrabNthreference = "No precedencies found"
End If
End If
End Function
This code did lead me onto thinking about using the simple activecell.precedences method but I think the problem is that it won't report offsheet and won't indicate if the formula is relative or absolute.
Any comments welcome but I think I've answered my own question :)

How to find words on both sides of (period)

in the following examples I need to get the words on either side on the period
I am using this regex
Dim myRegex As New Regex("[^\w]+")
Dim mymatch As String() = myRegex.Split(currentField)
where as currentfield = one of the following 3 samples
Contacts.Address2 as `Contact Address2`
Contacts.ContactID
CONCAT(Contacts.FirstName;;' ';;Contacts.LastName) as `Contact`
returns are as follows.
1-- Contacts, Address2, as, Contact and Address2 do not want the word as.
2-- Contacts and ContactID this is ok.
3-- CONCAT,Contacts,FirstName,Contacts,LastName,as and Contact.
3rd one this is too much do not want CONCAT,as or Contact. I only want the four words (ones before and after the period) to be returned Contacts, Firstname, Contacts, and Lastname
how can I write the regex to only get words before and after the period
I would consider matching vs. splitting the input:
For Each m As Match In Regex.Matches(input, "(\w+)\.(\w+)")
Console.WriteLine(
String.Join(", ",
m.Groups(1).Value,
m.Groups(2).Value
))
Next
This is an example, It's not clear what you expect to do with the returned results.
Ideone Demo
I think you are looking to only split inside round brackets and you are not interested in the word as. Thus, I suggest a 2 step approach:
Get the substring(s) in round brackets (\([^()]+\) regex)
If there are such substrings, split them with the split regex, if not, split the original string with split regex (\W+|\s*\bas\b\s* regex).
Sample code:
'Dim currentField As String = "Contacts.Address2 as `Contact Address2`"
Dim currentField As String = "CONCAT(Contacts.FirstName;;' ';;Contacts.LastName) as `Contact`"
'Dim currentField As String = "Contacts.ContactID"
Dim myRegex As New Regex("\([^()]+\)")
Dim splitRegex As New Regex("\W+|\s*\bas\b\s*")
Dim mymatch As MatchCollection = myRegex.Matches(currentField)
If mymatch.Count > 0 Then
For Each match As Match In mymatch
Dim mysubstrs As String() = splitRegex.Split(match.Value)
For Each substr As String In mysubstrs
If String.IsNullOrEmpty(substr) = False Then
Console.WriteLine(substr)
End If
Next
Next
Else
Dim mysubstrs As String() = splitRegex.Split(currentField)
For Each substr As String In mysubstrs
If String.IsNullOrEmpty(substr) = False Then
Console.WriteLine(substr)
End If
Next
End If
here is the final working routine, based on the accepted answer above
Public Sub Load_Field_List(FieldSTR As String, FieldType As String)
Dim t As New FileIO.TextFieldParser(New System.IO.StringReader(FieldSTR))
t.TextFieldType = Microsoft.VisualBasic.FileIO.FieldType.Delimited
t.Delimiters = New String() {","}
Dim currentRow As String()
Dim dr As DataRow
Dim ColListSTR As String = loadeddataview.Tables(0).Rows(0).Item("ColumnList")
Dim ColListSTRArr As String() = ColListSTR.Split(",")
While Not t.EndOfData
Try
currentRow = t.ReadFields()
Dim currentField As String 'field string
For Each currentField In currentRow
Dim startName As Integer
Dim endName As Integer
Dim name As String
dr = fieldDT.NewRow
Dim isValid As Boolean = False
If currentField = "" Then 'make sure current field has data
isValid = False
ElseIf (Regex.IsMatch(currentField, "(\w+)\.(\w+)")) = True Then 'make sure current field has xxxx.yyyy pattern
Dim m As Match = Regex.Match(currentField, "(\w+)\.(\w+)") 'sets m to the first xxxx.yyyy pattern
dr("Table") = m.Groups(1).Value 'sets table column to table name xxxx
dr("Column Name") = "`" & m.Groups(2).Value & "`" 'sets column name to column yyyy enclosed in ` `
If ColListSTRArr.Contains(m.Groups(2).Value) Then 'checks columnlist str to see if column visible
dr("Show") = "True"
Else
dr("Show") = "False"
End If
' this section overrides column name if it was set using AS `zzzzz` statement
startName = currentField.IndexOf("`")
endName = currentField.IndexOf("`", If(startName > 0, startName + 1, 0))
If (endName > startName) Then
Dim mylength As Integer = currentField.Length
name = currentField.Substring(startName, endName - startName + 1)
dr("Column Name") = name 'set override columname
dr("Field") = currentField.Substring(0, startName - 4) 'sets field minus the " as 'ZZZZZ" above
If ColListSTRArr.Contains(currentField.Substring(startName + 1, endName - startName - 1)) Then 'dup may be able to remove
dr("Show") = "True"
Else
dr("Show") = "False"
End If
Else
dr("Field") = currentField 'sets field if there was no " as `ZZZZZZ`" in string
End If
If FieldType = "Field" Then 'sets the column linking field
dr("Linking") = "No Linking"
Else
dr("Linking") = FieldType
End If
End If
' commit changes
fieldDT.Rows.Add(dr)
fieldDT.AcceptChanges()
DataGridView3.DataSource = fieldDT
DataGridView3.ClearSelection()
Next
Catch ex As Microsoft.VisualBasic.
FileIO.MalformedLineException
MsgBox("Line " & ex.Message &
"is not valid and will be skipped.")
End Try
End While
End Sub