Crawling web page links through Visaul Basic 2008 - regex

I have tried to convert one of my Visual Basic 6 programs to Visual Basic.NET. This program basically crawls email and mobile number data from a web page link. This program works great in Visual Basic 6 but after converting to Visual Basic.NET is not providing any function or result in .NET.
Imports VBScript_RegExp_55
Imports System.Collections.Specialized
Imports System.Diagnostics
Public Class autoextratorform
Dim emailsearch As New ListViewItem()
Dim numbersearch As New ListViewItem
Private Declare Function SendMessageByString _
Lib "user32" _
Alias "SendMessageA" _
(ByVal hwnd As Long, _
ByVal wMsg As Long, _
ByVal wParam As Long, _
ByVal lParam As String) _
As Long
Private Const LB_SELECTSTRING = &H18C
Private Declare Function ShellExecute Lib "shell32.dll" Alias "ShellExecuteA" (ByVal hwnd As Long, ByVal lpOperation As String, ByVal lpFile As String, ByVal lpParameters As String, ByVal lpDirectory As String, ByVal nShowCmd As Long) As Long
Private WithEvents cIE As InternetHtml
Private WithEvents cExtLinks As cExtractLinks
Private cGSearch As cGoogleSearch
Dim Excel As Object
Dim Excelsheet As Object
Dim a() As String
Dim b() As String
Dim c() As String
Dim i As Integer
' Needs reference to Microsoft VBscript Regular Expressions I recomend ver 5.5.
'
Dim baseurl As String ' var to store base url so we can build the full path
Dim dVisited ' Dictionary to hold visited urls = I think Missing here
Dim dEmail ' dictionary to hold emails = I think Missing here
Dim dnumber ' = I think Missing here
Dim dweb '= I think Missing here
' We are putting the emails in a list also, for user feed back
'It would be less momery intensive and faster to just keep these in the dictionry object
'which allows to easily tell if the email already exist
Dim regxPage ' var to hold regular expression to extract urls
Dim regxEmail ' var to hold regular expression to extract emails
Dim regnumber
Dim regweb
Dim Match, Matches As String ' we use these to store are regx matches
Dim Match1, Matches1 As String
Dim Match2, Matches2 As String
' Regular expressions are super powerfull and have been a part of unix for a long time
' goto the form load event to see the regex initialization
' to learn more about regular expressions and to download the latest scripting runtime see
' http://msdn.microsoft.com/scripting/default.htm?/scripting/vbscript/techinfo/vbsdocs.htm
Dim email, pageurl, mnumber, sweb As String
' The above is only because dictionary.exist does not work directly on Match var
Dim stopcrawl As Integer ' Used to exit crawl loop
Private Sub ComboBox1_SelectedIndexChanged(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles ComboBox1.SelectedIndexChanged
Select Case ComboBox1.SelectedIndex
Case 0
ListBox2.Items.Clear()
ListBox2.Items.Add("http://classads.jang.com.pk/default.asp?cityid=1")
ListBox2.Items.Add("http://www.merapakistan.com/classifieds/")
ListBox2.Items.Add("http://olx.com.pk/")
ListBox2.SelectedIndex = 0
txtStartUrl.Text = ListBox2.Text
Case 1
ListBox2.Items.Clear()
ListBox2.Items.Add("http://www.locanto.com.pk/")
ListBox2.Items.Add("http://pakistanclassifiedads.com/")
ListBox2.Items.Add("http://pkr.com.pk/")
txtStartUrl.Text = ListBox2.Text
Case 2
ListBox2.Items.Clear()
ListBox2.Items.Add("http://adpost.pk/")
ListBox2.Items.Add("http://page.com.pk/")
ListBox2.Items.Add("http://www.clasf.pk/")
ListBox2.SelectedIndex = 0
txtStartUrl.Text = ListBox2.Text
Case 3
ListBox2.Items.Clear()
ListBox2.Items.Add("http://www.nelaam.com/")
ListBox2.Items.Add("http://www.asani.com.pk/")
ListBox2.Items.Add("http://freepost.com.pk/")
ListBox2.SelectedIndex = 0
txtStartUrl.Text = ListBox2.Text
End Select
End Sub
Private Sub Button1_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Command1.Click
stopcrawl = 0 ' set stop crawl so we do not exit loop
If txtStartUrl.Text & "" = "" Then
lblresult.Text = ("Please Load your Desired Site List !")
Exit Sub
ElseIf txtStartUrl.Text & "" = "http://" Then
lblresult.Text = ("Please Load your Desired Site List !")
Exit Sub
ElseIf txtStartUrl.Text = "" Then
lblresult.Text = ("Please Load your Desired Site List !")
Exit Sub
End If
' the above should really check for a valid url, but I am a lazy PERL programmer
ListBox1.Items.Add(txtStartUrl.Text) 'add item to list
Label16.Text = (ListBox2.Items.Count - 1)
Label8.Text = (ListBox2.Items.Count)
lblresult.Text = " Start service for searching Email address and Mobile Numbers"
Command1.Enabled = False
ComboBox1.Enabled = False
LaVolpeButton4.Enabled = True
'UPGRADE_ISSUE: (2064) ComboBox property Combo1.Locked was not upgraded. More Information: https://docs.mobilize.net/vbuc/ewis#2064
LaVolpeButton8.Enabled = False
LaVolpeButton9.Enabled = False
txtStartUrl.ReadOnly = True
crawl() ' and were off
End Sub
Private Sub autoextrator_Load(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles MyBase.Load
With listnumber
.Columns.Clear()
.Columns.Add("Count!", CInt(Width * 0.1))
.Columns.Add("Mobile Number!", CInt(Width * 0.2))
.Columns.Add("Web URL!", CInt(Width * 0.68))
End With
With listemail
.Columns.Clear()
.Columns.Add("Count!", CInt(Width * 0.1))
.Columns.Add("Email Address!", CInt(Width * 0.2))
.Columns.Add("Web URL!", CInt(Width * 0.68))
End With
'initialize dictionary and regx objects
dVisited = CreateObject("Scripting.Dictionary")
dVisited.CompareMode = CompareMethod.Binary
dEmail = CreateObject("Scripting.Dictionary")
dEmail.CompareMode = CompareMethod.Binary
dnumber = CreateObject("Scripting.Dictionary")
dnumber.CompareMode = CompareMethod.Binary
dweb = CreateObject("Scripting.Dictionary")
dweb.CompareMode = CompareMethod.Binary
Dim counting1 As String
Dim counting2 As String
regxPage = New VBScript_RegExp_55.RegExp() ' Create a regular expression.
regxPage.Pattern = "HREF=""[^""#]+[.][^""#]+" ' Set pattern."
regxPage.IgnoreCase = True ' Set case insensitivity.
regxPage.Global = True ' Set global applicability.
regxEmail = New VBScript_RegExp_55.RegExp() ' Create a regular expression.
regxEmail.Pattern = "\b[a-z0-9._%+-]+#[a-z0-9.-]+\.[a-z]{2,4}\b" ' Set pattern."
regxEmail.IgnoreCase = True ' Set case insensitivity.
regxEmail.Global = True ' Set global applicability.
regnumber = New VBScript_RegExp_55.RegExp()
counting1 = "(\+92)-?\d{3}-?\d{7}"
counting2 = "(?:0092|0|\+92)-?\d{3}-?\d{7}"
regnumber.Pattern = counting1
regnumber.Pattern = counting2
regnumber.IgnoreCase = True
regnumber.Global = True
cIE = New InternetHtml()
cGSearch = New cGoogleSearch()
cExtLinks = New cExtractLinks()
End Sub
Sub crawl()
While ListBox1.Items.Count > 0 ' loop while list has data
If stopcrawl = 1 Then GoTo exitcrawl
getpage(ListBox1.Items(0)) ' This is the heart of the prog, except for the regx = I think Missing here
' stuff in the form load event
ListBox1.Items.Remove(0) ' remove item from list
End While
ListBox2.Items.Remove(ListBox2.SelectedIndex)
If ListBox2.SelectedIndex < ListBox2.Items.Count - 1 Then
ListBox2.SelectedIndex += 1
txtStartUrl.Text = ListBox2.Items.ToString
exitcrawl:
If ListBox2.Items.Count = 0 Then
MessageBox.Show("Searching has completed !", "Done ! searching", MessageBoxButtons.OK, MessageBoxIcon.Information)
txtStartUrl.ReadOnly = False
txtStartUrl.Text = ""
Label3.Text = "-----------------------------------------------------------"
LaVolpeButton4.Enabled = False
Command1.Enabled = True
LaVolpeButton8.Enabled = True
LaVolpeButton9.Enabled = True
End If
End If
End Sub
Sub getpage(ByVal page As String)
On Error Resume Next
If dVisited.Contains(page) Then
Exit Sub
Else
dVisited.Add(page, 1) ' add page to dVisited dictionary
Label6.Text = CStr(ListBox1.Items.Count)
Label14.Text = CStr(dVisited.Count)
Label3.Text = page
End If
baseurl = getpath(page) ' build full url - see getpath
TextBox1.Text = ""
If ListBox1.Items.Count > 5000 Then Exit Sub ' sets the maximum cache (so we don't run out of mem)
Using wc As New System.Net.WebClient
TextBox1.Text = wc.DownloadString(page)
End Using
Matches = regxPage.Execute(TextBox1.Text) ' Execute search.
For Each Me.Match In Matches
pageurl = Match
If InStr(1, pageurl, "http://", vbTextCompare) Then
If dVisited.Exists(pageurl) = False Then ListBox1.Items.Add(Mid(pageurl, 7))
Else
If dVisited.Exists(baseurl & Mid(pageurl, 7)) = False Then ListBox1.Items.Add(baseurl & Mid(pageurl, 7))
End If
Next
' search for email
Matches = regxEmail.Execute(TextBox1.Text) ' Execute search.
For Each Me.Match In Matches ' Iterate Matches collection.
' check if email exist
email = Match
Debug.Print(email & dEmail.Exists(email))
If dEmail.Exists(email) = False Then
dEmail.Add((email), 1)
Dim d As Integer
d = listemail.Items.Count
d = d + 1
emailsearch = listemail.Items.Add(d)
emailsearch.SubItems.Add(email.ToString)
emailsearch.SubItems.Add(pageurl.ToString)
ListBox3.Items.Add(email)
Label10.Text = listemail.Items.Count
End If
Next
Matches1 = regnumber.Execute(TextBox1.Text)
For Each Me.Match1 In Matches1
mnumber = Match1
Debug.Print(mnumber & dnumber.Exists(mnumber))
If dnumber.Exists(mnumber) = False Then
dnumber.Add((mnumber), 1)
Dim c As Integer
c = listnumber.Items.Count
c = c + 1
numbersearch = listnumber.Items.Add(c)
numbersearch.SubItems.Add(mnumber.ToString)
numbersearch.SubItems.Add(pageurl.ToString)
Label12.Text = listnumber.Items.Count
End If
Next
End Sub
Function getpath(ByVal URL As String) As String
' look for the last / and get a string up to that location
Dim lastbar As Integer = URL.LastIndexOf("/") + 1
Dim tmppath As String = URL.Substring(0, Math.Min(lastbar, URL.Length))
If tmppath = "http://" Then tmppath = URL ' full path already so return url
Return tmppath
End Function
Public Function StripOut(ByVal From As String, ByVal What As String) As String
Dim result As String = ""
result = From
For i As Integer = 1 To Strings.Len(What)
result = result.Replace(What.Substring(i - 1, Math.Min(1, What.Length - (i - 1))), "")
Next i
Return result
End Function
Private Sub LaVolpeButton4_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles LaVolpeButton4.Click
stopcrawl = 1
MsgBox(" Service has Stoped by USER !")
LaVolpeButton4.Enabled = False
Command1.Enabled = True
End Sub
Private Sub LaVolpeButton8_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles LaVolpeButton8.Click
Dim strData As String = ""
Dim strQ() As String
Dim intFile As Integer
' Set up the Common Dialog initial Directory
' to the application path. Filter for Text and All files
' Prompt the user to select a file
'
cmd.InitialDirectory = My.Application.Info.DirectoryPath
cmd.Filter = "Text Files(*.txt)|*.txt|All Files(*.*)|*.*"
cmd.Title = "Select Email Addresses File"
cmd.ShowDialog()
If cmd.FileName <> "" Then
'
' If the user selected a file
' open and read the entire contents
' then split it into records
'
intFile = FileSystem.FreeFile()
FileSystem.FileOpen(intFile, cmd.FileName, OpenMode.Input)
strData = FileSystem.InputString(intFile, FileSystem.LOF(intFile))
FileSystem.FileClose(intFile)
strQ = strData.Split(CChar(Environment.NewLine))
'
' Populate the texbox array with the questions
' (either the number of textboxes in the control array or
' number of questions, which ever is the smaller)
For Each strQ_item As String In strQ
ListBox2.Items.Add(strQ_item)
Next strQ_item
txtStartUrl.Text = ListBox2.Text
End If
End Sub
End Class
2- This is Visual basic 6 Code that works perfect.
Dim baseurl As String ' var to store base url so we can build the full path
Dim dVisited As Dictionary ' Dictionary to hold visited urls
Dim dEmail As Dictionary ' dictionary to hold emails
Dim dnumber As Dictionary
Dim dweb As Dictionary
Sub crawl()
While List1.ListCount > 0 ' loop while list has data
If stopcrawl = 1 Then GoTo exitcrawl
getpage List1.List(0) ' This is the heart of the prog, except for the regx
' stuff in the form load event
List1.RemoveItem (0) ' remove item from list
Wend
List2.RemoveItem (List2.ListIndex)
If List2.ListIndex < List2.ListCount - 1 Then
List2.ListIndex = List2.ListIndex + 1
txtStartUrl = List2.Text
Command1_Click
Else
exitcrawl:
If List2.ListCount = 0 Then
MsgBox "Searching has completed !", vbInformation, "Done ! searching"
WebBrowser1.Stop
txtStartUrl.Locked = False
txtStartUrl.Text = ""
Label3.Caption = "-----------------------------------------------------------"
LaVolpeButton4.Enabled = False
Command1.Enabled = True
LaVolpeButton8.Enabled = True
LaVolpeButton9.Enabled = True
End If
End If
End Sub

Related

vb6 select whole word in string

I am trying to search a string without regex perhaps a bad idea !
The search is acting on a string of text in a RichText Box
If I search for "is" the first word in the String is "This"
The last two letters of "This" is are highlighted in vbRed
The string to be searched has two other occurrences of "is" and these are found and highlighted as expected
The question can I prevent the "is" in "This" from being found?
Private Sub btnSearch_Click()
Dim pos As Integer
Dim strToFind As String
Dim Y As Integer
Dim Ask As String
pos = 1
strToFind = tbSearch.Text
Do
strToFind = tbSearch.Text
pos = InStr(1, strToSearch, strToFind)
For Y = 1 To Len(strToSearch)
Ask = MsgBox("Yes Next Occurrence or No To Exit ?", vbYesNo, "Question")
If Ask = vbYes Then
lbOne.AddItem pos
tbAns.Text = pos
If pos = 0 Then
Exit Sub
End If
rtbOne.SelStart = pos - 1
rtbOne.SelLength = Len(strToFind)
rtbOne.SelColor = vbRed
pos = InStr(pos + 1, strToSearch, strToFind)
Else
tbAns.Text = "NO"
pos = InStr(pos + 1, strToSearch, strToFind)
tbAns.Text = pos
Exit Sub
End If
Next
Loop Until pos > 0
End Sub
Private Sub Form_Load()
strToSearch = "This is a lot of text that will be loaded in the lbText and we will search it is it a case sensative Search"
rtbOne.Text = strToSearch
tbSearch.Text = "is"
End Sub
If this is not possible a few suggestions on how to use regex
I know this much I need to add the Reference and this might be the
Pattern myRegExp.Pattern = "(.)\strToFind\b(.)"
TOM FindText accepts the tomMatchWord flag. Just use that. Don't screw aroung extracting text out of the control and then munching on it using slow scripting language crutches like RegEx.
I was intrigued by the idea of a "built-in" search, as suggested by Bob77 and Mark, so I put together code to implement this idea. The code uses a WinAPI call but is really pretty simple overall and supports moving forward and backward along with toggles for case sensitivity and whole words:
Option Explicit
Private Declare Function SendMessage Lib "user32" Alias "SendMessageA" (ByVal hwnd As Long, ByVal wMsg As Long, ByVal wParam As Long, lParam As Any) As Long
Private Const WM_USER = &H400&
Private Const EM_GETOLEINTERFACE = (WM_USER + 60)
Private Enum Direction
Forward = 1
Backward = -1
End Enum
Private Doc As ITextDocument
Private Sub Form_Load()
RichTextBox1.HideSelection = False
RichTextBox1.Text = "is This is a lot of text that will be loaded in the lbText and we will " & _
"search it is it a case sensative Search" & vbCr & vbCr & _
"is and IS and is"
SearchTerm.Text = "is"
Dim Unknown As IUnknown
SendMessage RichTextBox1.hwnd, EM_GETOLEINTERFACE, 0&, Unknown
Set Doc = Unknown
End Sub
Private Sub cmdForward_Click()
Match SearchTerm.Text, chkWhole.Value, chkCase.Value, Forward
End Sub
Private Sub cmdBack_Click()
Match SearchTerm.Text, chkWhole.Value, chkCase.Value, Backward
End Sub
Private Sub Match(ByVal SearchTerm As String, ByVal WholeWords As Integer, ByVal CaseSensitive As Integer, ByVal Direction As Direction)
Dim Flags As Long
Flags = 2 * WholeWords + 4 * CaseSensitive
Doc.Selection.FindText SearchTerm, Direction * Doc.Selection.StoryLength, Flags
End Sub
You will need to add a reference to RICHED20.dll using the "Browse..." button in Project|References.
You might find following InStrAll implementation based on VBScript.RegExp useful.
Option Explicit
Private Sub Form_Load()
Const STR_TEXT As String = "This is a lot of text that will be loaded in the lbText and we will search it is it a case sensative Search"
Dim vElem As Variant
For Each vElem In InStrAll(STR_TEXT, "is")
Debug.Print vElem, Mid$(STR_TEXT, vElem, 2)
Next
End Sub
Public Function InStrAll(sText As String, sSearch As String, Optional ByVal Compare As VbCompareMethod = vbTextCompare) As Variant
Dim lIdx As Long
Dim vRetVal As Variant
With CreateObject("VBScript.RegExp")
.Global = True
.IgnoreCase = (Compare <> vbBinaryCompare)
.Pattern = "[.*+?^${}()/|[\]\\]"
.Pattern = "\b" & .Replace(sSearch, "\$&") & "\b"
With .Execute(sText)
If .Count = 0 Then
vRetVal = Array()
ElseIf .Count = 1 Then
vRetVal = Array(.Item(0).FirstIndex + 1)
Else
ReDim vRetVal(0 To .Count - 1) As Variant
For lIdx = 0 To .Count - 1
vRetVal(lIdx) = .Item(lIdx).FirstIndex + 1
Next
End If
End With
End With
InStrAll = vRetVal
End Function
The point is first you have to escape the searched string (prefix all regex control symbols with backslash) and then wrap this escaped pattern with \bs before performing the "global" search for all matches.
The InStrAll function returns an array of indexes in the original text. It's up to your to impl actual color-coded highlighting in the RichTextBox control of your choice. (I would set the background color, not the foreground of the found snippet, if I had a choice. Notice how most browsers use yellow background for search results highlighting.)
For a non-Regex approach, try the following:
Option Explicit
Private Sub Form_Load()
RichTextBox1.Text = "is This is a lot of text that will be loaded in the lbText and we will " & _
"search it is it a case sensative Search" & vbCr & vbCr & _
"is and is and is"
End Sub
Private Sub Command1_Click()
Dim SearchTerm As String
Dim SearchIndex As Integer
SearchTerm = "is"
SearchIndex = 1
Do
SearchIndex = InStr(SearchIndex, RichTextBox1.Text, SearchTerm)
If isMatch(SearchIndex, SearchTerm) Then
RichTextBox1.SelStart = SearchIndex - 1
RichTextBox1.SelLength = Len(SearchTerm)
RichTextBox1.SelColor = vbRed
End If
If SearchIndex > 0 Then SearchIndex = SearchIndex + Len(SearchTerm)
Loop Until SearchIndex = 0
End Sub
Private Function isMatch(ByVal SearchIndex As Long, ByVal SearchTerm As String) As Boolean
If SearchIndex = 1 Then
If Mid(RichTextBox1.Text, SearchIndex + Len(SearchTerm), 1) = " " Then isMatch = True
ElseIf SearchIndex + Len(SearchTerm) >= Len(RichTextBox1.Text) Then
If Mid(RichTextBox1.Text, SearchIndex - 1, 1) = " " Then isMatch = True
ElseIf SearchIndex > 1 Then
If (Mid(RichTextBox1.Text, SearchIndex - 1, 1) = " " Or Mid(RichTextBox1.Text, SearchIndex - 1, 1) = vbCr) And Mid(RichTextBox1.Text, SearchIndex + Len(SearchTerm), 1) = " " Then isMatch = True
End If
End Function
As noted in the comments, the original code had limitations. The code now supports matches at the beginning and end of the text, along with embedded breaks. You will likely need to add more checks to the isMatch method.

Excel VBA RegEx that extracts numbers from price values in range (has commas, $ and -)

I have a field data extracted from a database which represents a range of values, but it's coming in Excel as a String format $86,000 - $162,000.
I need to extract the minimum value and the maximum value from each cell, so I need to extract the numeric portion of it, and ignore the $, - and the ,.
I've attached an image of the data I have, and the values I want to extract from it.
This is the closest pattern I got with RegEx, but I'ts not what I'm looking for.
Pattern = (\d+)(?:\.(\d{1,2}))?
Can anyone assist ?
Just wondering why Regex?
Function GetParts(priceRange As String) As Double()
Dim arr() As String
Dim parts() As Double
If InStr(1, priceRange, "-") > 0 Then
arr = Split(priceRange, "-")
ReDim parts(0 To UBound(arr))
Dim i As Long
For i = 0 To UBound(arr)
parts(i) = CDbl(Replace$(Replace$(Trim$(arr(i)), "$", ""), ",", ""))
Next i
End If
GetParts = parts
End Function
Sub test()
MsgBox GetParts("$14,000 - $1,234,567")(0) 'Minimum
End Sub
EDIT
Yet you could do this with regex to match the data string into the parts:
Function GetPartsRegEx(priceRange As String) As Variant
Dim arr() As Double
Dim pricePattern As String
pricePattern = "(\$?\d+[\,\.\d]*)"
'START EDIT
Static re As RegExp
If re Is Nothing Then
Set re = New RegExp
re.IgnoreCase = True
re.Global = True
re.Pattern = pricePattern & "\s*[\-]\s*" & pricePattern 'look for the pattern first
End If
Static nums As RegExp
If nums Is Nothing Then
Set nums = New RegExp
'to remove all non digits, except decimal point in case you have pennies
nums.Pattern = "[^0-9.]"
nums.Global = True
End If
'END EDIT
If re.test(priceRange) Then
ReDim arr(0 To 1) ' fill return array
arr(0) = CDbl(nums.Replace(re.Replace(priceRange, "$1"), ""))
arr(1) = CDbl(nums.Replace(re.Replace(priceRange, "$2"), ""))
Else
'do some error handling here
Exit Function
End If 'maybe throw error if no +ve test or
GetPartsRegEx = arr
End Function
Sub test()
MsgBox GetPartsRegEx("$1,005.45 - $1,234,567.88")(1)
End Sub
Here is quick Example Demo https://regex101.com/r/RTNlVF/1
Pattern "^\$(\d+\,\d+)\s\-\s\$(\d+\,\d+)"
Option Explicit
Private Sub Example()
Dim RegExp As New RegExp
Dim Pattern As String
Dim CelValue As String
Dim rng As Range
Dim Cel As Range
Set rng = ActiveWorkbook.Sheets("Sheet1" _
).Range("A2", Range("A9999" _
).End(xlUp))
For Each Cel In rng
DoEvents
Pattern = "^\$(\d+\,\d+)\s\-\s\$(\d+\,\d+)"
If Pattern <> "" Then
With RegExp
.Global = True
.MultiLine = True
.IgnoreCase = False
.Pattern = Pattern
End With
If RegExp.Test(Cel.Value) Then
' Debug.Print Cel.Value
Debug.Print RegExp.Replace(CStr(Cel), "$1")
Debug.Print RegExp.Replace(CStr(Cel), "$2")
End If
End If
Next
End Sub
Without a loop (but still no regex):
Sub Split()
With Columns("B:B")
.Replace What:="$", Replacement:=""
Application.CutCopyMode = False
.TextToColumns Destination:=Range("B1"), DataType:=xlDelimited, Other:=True, OtherChar:="-", FieldInfo:=Array(Array(1, 1), Array(2, 1))
End With
Columns("B:C").Insert Shift:=xlToRight
Columns("D:E").NumberFormat = "0"
Range("D1").FormulaR1C1 = "Min Value"
Range("E1").FormulaR1C1 = "Max Value"
With Range("D1:E1").Interior
.Pattern = xlSolid
.PatternColorIndex = xlAutomatic
.Color = 12611584
End With
With Range("D1:E1").Font
.ThemeColor = xlThemeColorDark1
.TintAndShade = 0
End With
End Sub
I made this function:
Hope it helps.
Code:
Function ExtractNumber(ByVal TextInput As String, _
Optional ByVal Position As Byte = 0, _
Optional ByVal Delimiter As String = "-") As Variant
' You can use this function in a subprocess that
' writes the values in the cells you want, or
' you can use it directly in the ouput cells
' Variables
Dim RemoveItems(2) As String
Dim Aux As Variant
' The variable RemoveItems is an array
' containing the characters you want to remove
RemoveItems(0) = "."
RemoveItems(1) = ","
RemoveItems(2) = " "
' STEP 1 - The variable Aux will store the text
' given as input
Aux = TextInput
' STEP 2 - Characters stored in the variable
' RemoveItems will be removed from Aux
For i = 0 To UBound(RemoveItems)
Aux = Replace(Aux, RemoveItems(i), "")
Next i
' STEP 3 - Once Aux is "clean", it will be
' transformed into an array containing the
' values separated by the delimiter
' As you can see at the function's header,
' Delimiter default value is "-". You can change
' it depending on the situation
Aux = Split(Aux, Delimiter)
' STEP 4 - The result of this function will be
' a numeric value. So, if the value of the
' selected position in Aux is not numeric it will
' remove the first character assuming it is a
' currency symbol.
' If something fails in the process the function
' will return "ERROR", so you can know you may
' verify the inputs or adjust this code for
' your needs.
On Error GoTo ErrHndl
If Not IsNumeric(Aux(Position)) Then
ExtractNumber = CLng(Mid(Aux(Position), 2))
Else
ExtractNumber = CLng(Aux(Position))
End If
Exit Function
ErrHndl:
ExtractNumber = "ERROR"
End Function
You can even do this with just worksheet formulas. Under certain circumstances, Excel will ignore the $ and ,. The double unary converts the returned string to a numeric value.
First Value: =--LEFT(A1,FIND("-",A1)-1)
Second Value: =--MID(A1,FIND("-",A1)+1,99)

Extract text from 2 strings from selected Outlook email

I have code to import email body data from Outlook to Excel. I only need Name, ID, code from the email.
I have done everything except to extract the ID from a fixed sentence:
cn=SVCLMCH,OU=Users,OU=CX,DC=dm001,DC=corp,DC=dcsa,DC=com
The id is SVCLMCH in this case, that means I need to extract the text between "cn=" and ",OU=Users".
Sub import_code()
Dim O As Outlook.Application
Set O = New Outlook.Application
Dim ONS As Outlook.Namespace
Set ONS = O.GetNamespace("MAPI")
Dim OMAIL As Outlook.MailItem
Set OMAIL = Nothing
Dim ws As Object
Set ws = ThisWorkbook.Worksheets("Import code from Outlook")
Dim rcount As Long
Dim vText As Variant
Dim sText As String
Dim i As Long
If O.ActiveExplorer.Selection.Count = 0 Then
msgbox "No Items selected!", vbCritical, "Error"
End If
On Error Resume Next
'Process each selected record
rcount = ws.UsedRange.Rows.Count
For Each OMAIL In O.ActiveExplorer.Selection
sText = OMAIL.Body
vText = Split(sText, Chr(13))
'Find the next empty line of the worksheet
rcount = rcount + 1
'Check each line of text in the message body
For i = UBound(vText) To 0 Step -1
If InStr(1, vText(i), "Password Generated and set for:") > 0 Then
vItem = Split(vText(i), Chr(58))
ws.Range("A" & rcount) = Trim(vItem(1))
End If
If InStr(1, vText(i), "cn=") > 0 Then
vItem = Split(vText(i), Chr(58))
ws.Range("b" & rcount) = Trim(vItem(1))
End If
If InStr(1, vText(i), "Password:") > 0 Then
vItem = Split(vText(i), Chr(58))
ws.Range("c" & rcount) = Trim(vItem(1))
End If
Next i
Next OMAIL
End Sub
The trick here is to use the Split() function
Dim Arr() As String
Dim j As Integer
Dim k As Integer
Dim strvar As String
Dim strval As String
Dim strID As String
If InStr(1, vtext(i), "cn=") > 0 Then
' split the whole line in an array - "," beeing the value separator
Arr = Split(vtext(i), ",")
' loop through all array elements
For j = 0 To UBound(r) - 1
' find the position of =
k = InStr(Arr(j), "=")
strvar = Left(Arr(j), k - 1) ' this is the variable name, ie "cn"
strval = Mid(Arr(j), k + 1) ' this is the value, ie "SVCLMCH"
' now do what you want with a specific variable
Select Case strvar
Case "cn"
strID = strval
Case Else
' do nothing
End Select
Next j
End If
you can use a helper function like this:
Function GetID(strng As String)
Dim el As Variant
For Each el In Split(strng, ",")
If InStr(1, el, "cn=") > 0 Then
GetID = Mid(el, InStr(1, el, "cn=") + 3)
Exit Function
End If
Next
End Function
and your main code would exploit it as:
If InStr(1, vText(i), "cn=") > 0 Then ws.Range("b" & rcount) = GetID(CStr(vText(i)))
Use Regular Expression extract the ID from the sentence
Example Pattern = "(?<=cn=)([^.]+)(?=\,OU=Users)"
https://regex101.com/r/67u84s/2
Code Example
Option Explicit
Private Sub Examplea()
Dim Matches As Variant
Dim RegEx As Object
Set RegEx = CreateObject("VbScript.RegExp")
Dim olApp As Object
Set olApp = CreateObject("Outlook.Application")
Dim Item As Outlook.MailItem
Set Item = olApp.ActiveExplorer.Selection.Item(1)
Dim Pattern As String
Pattern = "(?<=cn=)([^.]+)(?=\,OU=Users)"
With RegEx
.Global = False
.Pattern = Pattern
.IgnoreCase = True
Set Matches = .Execute(Item.Body)
End With
If Matches.Count > 0 Then
Debug.Print Matches(0).SubMatches(0)
With ThisWorkbook.Sheets("Sheet1")
.Range("A1").Value = Trim(Matches(0).SubMatches(0))
End With
End If
End Sub

Parse a String in Excel Vba

I have a macro that send an XMLHTTP request to a server and it gets as response a plain text string, not a JSON format string or other standard formats (at least for what I know).
I would like to parse the output string in order to access the data in an structured approach in the same fashion as the parseJson subroutine in this link
My problem is I am not good with regular expressions and I am not able to modify the routine for my needs.
The string that I need to parse has the following structure:
The string is a single line
Each single parameter is defined by its parameter name the equal simbol, its value and ending with; "NID=3;" or "SID=Test;"
Parameter can be collected in "structures" starts and end with the symbol | and they are identified with their name followed by ; such as |STEST;NID=3;SID=Test;|
A structure can contain also other structures
An example of a output string is the following
|KC;|AD;PE=5;PF=3;|CD;PE=5;HP=test;|CD;PE=3;HP=abc;|
In this case there is a macro structure KC which contains a structure AD. The structure AD is composed by the parameters PE, PF and 2 structures CD. And finaly the structures CD have the parameters PE and HP
So I would like to parse the string to obtain an Object/Dictionary that reflects this structure, can you help me?
Adds after the first answers
Hi all, thank you for your help, but I think I should make more clear the output that I would like to get.
For the example string that I have, I would like to have an object with the following structure:
<KC>
<AD>
<PE>5</PE>
<PF>3</PF>
<CD>
<PE>5</PE>
<HP>test</HP>
</CD>
<CD>
<PE>3</PE>
<HP>abc</HP>
</CD>
</AD>
</KC>
So I started to wrote a possible working code base on some hint from #Nvj answer and the answer in this link
Option Explicit
Option Base 1
Sub Test()
Dim strContent As String
Dim strState As String
Dim varOutput As Variant
strContent = "|KC;|AD;PE=5;PF=3;|CD;PE=5;HP=test;|CD;PE=3;HP=abc;|"
Call ParseString(strContent, varOutput, strState)
End Sub
Sub ParseString(ByVal strContent As String, varOutput As Variant, strState As String)
' strContent - source string
' varOutput - created object or array to be returned as result
' strState - Object|Array|Error depending on processing to be returned as state
Dim objTokens As Object
Dim lngTokenId As Long
Dim objRegEx As Object
Dim bMatched As Boolean
Set objTokens = CreateObject("Scripting.Dictionary")
lngTokenId = 0
Set objRegEx = CreateObject("VBScript.RegExp")
With objRegEx
.Global = True
.MultiLine = True
.IgnoreCase = True
.Pattern = "\|[A-Z]{2};" 'Pattern for the name of structures
Tokenize objTokens, objRegEx, strContent, lngTokenId, bMatched, "str"
.Pattern = "[A-Z]{2}=[^\|=;]+;" 'Pattern for parameters name and values
Tokenize objTokens, objRegEx, strContent, lngTokenId, bMatched, "par"
End With
End Sub
Sub Tokenize(objTokens, objRegEx, strContent, lngTokenId, bMatched, strType)
Dim strKey As String
Dim strKeyPar As String
Dim strKeyVal As String
Dim strWork As String
Dim strPar As String
Dim strVal As String
Dim strLevel As String
Dim strRes As String
Dim lngCopyIndex As Long
Dim objMatch As Object
strRes = ""
lngCopyIndex = 1
With objRegEx
For Each objMatch In .Execute(strContent)
If strType = "str" Then
bMatched = True
With objMatch
strWork = Replace(.Value, "|", "")
strWork = Replace(strWork, ";", "")
strLevel = get_Level(strWork)
strKey = "<" & lngTokenId & strLevel & strType & ">"
objTokens(strKey) = strWork
strRes = strRes & Mid(strContent, lngCopyIndex, .FirstIndex - lngCopyIndex + 1) & strKey
lngCopyIndex = .FirstIndex + .Length + 1
End With
lngTokenId = lngTokenId + 1
ElseIf strType = "par" Then
strKeyPar = "<" & lngTokenId & "par>"
strKeyVal = "<" & lngTokenId & "val>"
strKey = strKeyPar & strKeyVal
bMatched = True
With objMatch
strWork = Replace(.Value, ";", "")
strPar = Split(strWork, "=")(0)
strVal = Split(strWork, "=")(1)
objTokens(strKeyPar) = strPar
objTokens(strKeyVal) = strVal
strRes = strRes & Mid(strContent, lngCopyIndex, .FirstIndex - lngCopyIndex + 1) & strKey
lngCopyIndex = .FirstIndex + .Length + 1
End With
lngTokenId = lngTokenId + 2
End If
Next
strContent = strRes & Mid(strContent, lngCopyIndex, Len(strContent) - lngCopyIndex + 1)
End With
End Sub
Function get_Level(strInput As String) As String
Select Case strInput
Case "KC"
get_Level = "L1"
Case "AD"
get_Level = "L2"
Case "CD"
get_Level = "L3"
Case Else
MsgBox ("Error")
End
End Select
End Function
This function creates a dictionary with an item for each structure name, parameter name and parameter value as shown in the figure
Thanks to the function get_Level the items associated to structures have a level that should help to preserve the original hierarchy of the data.
So what I am missing is a function to create an object that has the original structure of the input string. This is what the Retrieve function do in this answer link, but I do not know how to adapt it to my case
This looks like a simple nested delimited string. A couple of Split() functions will do the trick:
Option Explicit
Function parseString(str As String) As Collection
Dim a1() As String, i1 As Long, c1 As Collection
Dim a2() As String, i2 As Long, c2 As Collection
Dim a3() As String
a1 = Split(str, "|")
Set c1 = New Collection
For i1 = LBound(a1) To UBound(a1)
If a1(i1) <> "" Then
Set c2 = New Collection
a2 = Split(a1(i1), ";")
For i2 = LBound(a2) To UBound(a2)
If a2(i2) <> "" Then
a3 = Split(a2(i2), "=")
If UBound(a3) > 0 Then
c2.Add a3(1), a3(0)
ElseIf UBound(a3) = 0 Then
c2.Add a3(0)
End If
End If
Next i2
c1.Add c2
End If
Next i1
Set parseString = c1
End Function
Sub testParseString()
Dim c As Collection
Set c = parseString("|KC;|AD;PE=5;PF=3;|CD;PE=5;HP=test;|CD;PE=3;HP=abc;|")
Debug.Assert c(1)(1) = "KC"
Debug.Assert c(2)("PE") = "5"
Debug.Assert c(3)(1) = "CD"
Debug.Assert c(4)("HP") = "abc"
Debug.Assert c(4)(3) = "abc"
End Sub
Note that you can address values by both, index and key (if key existed in the input). If key was not provided you can only access the value by its index. You can also iterate collection recursively to get all the values in a tree structure.
Food for thought: since your structures may have repeated names (in your case "CD" structure happens twice) Collections / Dictionaries would find it problematic to store this elegantly (due to key collisions). Another good way to approach this is to create an XML structure with DOMDocument and use XPath to access its elements. See Program with DOM in Visual Basic
UPDATE: I've added XML example below as well. Have a look.
Here is another take on your string parsing issue using DOMDocument XML parser. You need to include Microsoft XML, v.6.0 in your VBA references.
Function parseStringToDom(str As String) As DOMDocument60
Dim a1() As String, i1 As Long
Dim a2() As String, i2 As Long
Dim a3() As String
Dim dom As DOMDocument60
Dim rt As IXMLDOMNode
Dim nd As IXMLDOMNode
Set dom = New DOMDocument60
dom.async = False
dom.validateOnParse = False
dom.resolveExternals = False
dom.preserveWhiteSpace = True
Set rt = dom.createElement("root")
dom.appendChild rt
a1 = Split(str, "|")
For i1 = LBound(a1) To UBound(a1)
If a1(i1) <> "" Then
a2 = Split(a1(i1), ";")
Set nd = dom.createElement(a2(0))
For i2 = LBound(a2) To UBound(a2)
If a2(i2) <> "" Then
a3 = Split(a2(i2), "=")
If UBound(a3) > 0 Then
nd.appendChild dom.createElement(a3(0))
nd.LastChild.Text = a3(1)
End If
End If
Next i2
rt.appendChild nd
End If
Next i1
Set parseStringToDom = dom
End Function
Sub testParseStringToDom()
Dim dom As DOMDocument60
Set dom = parseStringToDom("|KC;|AD;PE=5;PF=3;|CD;PE=5;HP=test;|CD;PE=3;HP=abc;|")
Debug.Assert Not dom.SelectSingleNode("/root/KC") Is Nothing
Debug.Assert dom.SelectSingleNode("/root/AD/PE").Text = "5"
Debug.Assert dom.SelectSingleNode("/root/CD[1]/HP").Text = "test"
Debug.Assert dom.SelectSingleNode("/root/CD[2]/HP").Text = "abc"
Debug.Print dom.XML
End Sub
As you can see this converts your text into an XML DOM document preserving all the structures and allowing for duplicates in naming. You can then use XPath to access any node or value. This can also be extended to have more nesting levels and further structures.
This is the XML document it creates behind the scenes:
<root>
<KC/>
<AD>
<PE>5</PE>
<PF>3</PF>
</AD>
<CD>
<PE>5</PE>
<HP>test</HP>
</CD>
<CD>
<PE>3</PE>
<HP>abc</HP>
</CD>
</root>
I've started to write a parser in VBA for the string structure specified by you, and it's not complete, but I'll post it anyways. Maybe you can pick up some ideas from it.
Sub ParseString()
Dim str As String
str = "|KC;|AD;PE=5;PF=3;|CD;PE=5;HP=test;|CD;PE=3;HP=abc;|"
' Declare an object dictionary
' Make a reference to Microsoft Scripting Runtime in order for this to work
Dim dict As New Dictionary
' If the bars are present in the first and last character of the string, replace them
str = Replace(str, "|", "", 1, 1)
If (Mid(str, Len(str), 1) = "|") Then
str = Mid(str, 1, Len(str) - 1)
End If
' Split the string by bars
Dim substring_array() As String
substring_array = Split(str, "|")
' Declare a regex object
' Check the reference to Microsoft VBScript Regular Expressions 5.5 in order for this to work
Dim regex As New RegExp
With regex
.Global = True
.IgnoreCase = True
.MultiLine = True
End With
' Object to store the regex matches
Dim matches As MatchCollection
Dim param_name_matches As MatchCollection
Dim parameter_value_matches As MatchCollection
' Define some regex patterns
pattern_for_structure_name = "^[^=;]+;"
pattern_for_parameters = "[^=;]+=[^=;]+;"
pattern_for_parameter_name = "[^=;]="
pattern_for_parameter_val = "[^=;];"
' Loop through the elements of the array
Dim i As Integer
For i = 0 To UBound(substring_array) - LBound(substring_array)
' Get the array element in a string
str1 = substring_array(i)
' Check if it contains a structure name
regex.Pattern = pattern_for_structure_name
Set matches = regex.Execute(str1)
If matches.Count = 0 Then
' This substring does not contain a structure name
' Check if it contains parameters
regex.Pattern = pattern_for_parameter
Set matches = regex.Execute(matches(0).Value)
If matches.Count = 0 Then
' There are no parameters as well as no structure name
' This means the string had || - invalid string
MsgBox ("Invalid string")
Else
' The string contains parameter names
' Add each parameter name to the dictionary
Dim my_match As match
For Each my_match In matches
' Get the name of the parameter
regex.Pattern = pattern_for_parameter_name
Set parameter_name_matches = regex.Execute(my_match.Value)
' Check if the above returned any matches
If parameter_name_matches.Count = 1 Then
' Remove = sign from the parameter name
parameter_name = Replace(parameter_name_matches(0).Value, "=", "")
' Get the value of the parameter
regex.Pattern = pattern_for_parameter_value
Set parameter_value_matches = regex.Execute(my_match.Value)
' Check if the above returned any matches
If parameter_value_matches.Count = 1 Then
' Get the value
parameter_value = Replace(parameter_value_matches(0).Value, ";", "")
' Add the parameter name and value as a key pair to the Dictionary object
dict.Item(parameter_name) = parameter_value
Else
' Number of matches is either 0 or greater than 1 - in both cases the string is invalid
MsgBox ("Invalid string")
End If
Else
' Parameter name did not match - invalid string
MsgBox ("Invalid string")
End If
Next
End If
ElseIf matches.Count = 1 Then
' This substring contains a single structure name
' Check if it has parameter names
Else
' This substring contains more than one structure name - the original string is invalid
MsgBox ("Invalid string")
End If
Next i
End Sub

VBA Regex Pattern: failing when underscore in my pattern

Looking for some help with my macro that loops through subfolders and brings back data from the workbooks that match my filename pattern, because the name changes each month.
It works seamlessly if the pattern is "[0-9][0-9][0-9][0-9][0-9][0-9] Filename"
But fails if "[0-9][0-9][0-9][0-9]_[0-9][0-9] Filename"
Any ideas on how to handle the underscore please?
This fails "[0-9][0-9][0-9][0-9][_][0-9][0-9] Filename"
Thanks heaps
GWS
Option Explicit
Option Base 1
Private Const PORTFOLIO_CODE As String = "G030"
Private Sub ExtractData()
' get workbook list
Dim wbList As Collection
Set wbList = New Collection
Application.DisplayAlerts = False
RecursiveFileSearch _
"O:\Sales and Marketing\Monthly Reports\", _
"[0-9][0-9][0-9][0-9][_][0-9][0-9] Monthly Report.xlsm", _ 'fails to find any workbooks
'"[0-9][0-9][0-9][0-9][0-9][0-9] Monthly Report.xlsm", _ 'would work except my file names contain underscores
wbList
Dim resultOffset As Integer
wsResult.Name = result
resultOffset = 1
Dim wbName As Variant, wbOpen As Workbook, wsFund As Worksheet
For Each wbName In wbList
' loop through workbook list
' - open workbook, hidden
Application.ScreenUpdating = False
Set wbOpen = Workbooks.Open(Filename:=wbName, ReadOnly:=True)
wbOpen.Windows(1).Visible = False
' - get worksheet for fund
Set wsFund = wbOpen.Worksheets(PORTFOLIO_CODE)
Application.ScreenUpdating = True
' - find top of data
Dim valueDate As Date
valueDate = WorksheetFunction.EoMonth(DateSerial(2000 + CInt(Left(wbOpen.Name, 2)), CInt(Mid(wbOpen.Name, 3, 2)), 1), 0)
Debug.Print valueDate, wbOpen.Name
ThisWorkbook.Worksheets(PORTFOLIO_CODE).Activate
Dim baseData As Excel.Range
Set baseData = wsFund.Range("AQ:AQ").Find("Currency")
If Not baseData Is Nothing Then
' - loop through data
Dim rowOffset As Integer
rowOffset = 0
wsResult.Range("A1").Offset(resultOffset, 0).Value = valueDate ' baseData.Offset(rowOffset, 0).Value
wsResult.Range("A1").Offset(resultOffset, 1).Value = baseData.Offset(rowOffset, 0).Value
wsResult.Range("A1").Offset(resultOffset, 2).Value = baseData.Offset(rowOffset, 5).Value
resultOffset = resultOffset + 1
End If
' - close workbook
wbOpen.Close SaveChanges:=False
DoEvents
Next
Application.DisplayAlerts = True
End Sub
RecursiveFileSearch
Sub RecursiveFileSearch( _
ByVal targetFolder As String, _
ByRef filePattern As String, _
ByRef matchedFiles As Collection _
)
Dim oRegExp As New VBScript_RegExp_55.RegExp
oRegExp.Global = False
oRegExp.IgnoreCase = True
oRegExp.MultiLine = False
oRegExp.Pattern = filePattern
Dim oFSO As Scripting.FileSystemObject
Set oFSO = New Scripting.FileSystemObject
'Get the folder oect associated with the target directory
Dim oFolder As Variant
Set oFolder = oFSO.GetFolder(targetFolder)
'Loop through the files current folder
Dim oFile As Variant
For Each oFile In oFolder.Files
If oRegExp.test(oFile.Name) Then
matchedFiles.Add oFile
End If
Next
'Loop through the each of the sub folders recursively
Dim oSubFolders As Object
Set oSubFolders = oFolder.Subfolders
Dim oSubfolder As Variant
For Each oSubfolder In oSubFolders
RecursiveFileSearch oSubfolder, filePattern, matchedFiles
Next
'Garbage Collection
Set oFolder = Nothing
Set oFile = Nothing
Set oSubFolders = Nothing
Set oSubfolder = Nothing
Set oFSO = Nothing
Set oRegExp = Nothing
End Sub
Perhaps:
\d{4}_\d{2}.*Monthly Report\.xlsm
My code was escaping the () and . to override the defined regex behavior. Portland Runner suggestion solved the question. ^[0-9]{3,4}[_][0-9]{2} SAMPSON International Shares Passive (Hedged) Trust Mandate Monthly Report.xlsm