As per my requirement i need to check three conditions using regular expression.
'Find the dot position
dotPos = InStr(editFormat, ".")
'Find whole number of digits
wholeNum = Left$(editFormat, dotPos - 1)
'Find decimal number of digits
decNum = Mid(editFormat, dotPos + 1, strLen - dotPos - 1)
regularExp = "^[-]{0,1}[0-9]{0," & wholeNum & "}\" & DecSep & "[0-9]{0," & decNum & "}$"
Here, i need to validate
1) whole value(Before dot)
2) decimal value(After dot)
3) whole length of the input.
wholeNum = 30
decNum = 20
Ex: The value is 123456789012345678901234567890.12345678901234567890
As per my code this two conditions are working Fine.
But i need to add one more condition is Total length should be 40.
Ex: Possible inputs for your example: ( (30.10) or (20.20) or (25.15) )
1) 123456789012345678901234567890.1234567890 (Total should be 40)
2) 12345678901234567890.12345678901234567890
3) 1234567890123456789012345.678901234567890
How to add that condition in my code.
Thanks.
I'd use a lookahead assertion:
regularExp = "^" & "(?=.{40}$)" & "[-]{0,1}[0-9]{0," & wholeNum & "}\" & DecSep & "[0-9]{0," & decNum & "}$"
assuming you're counting including the dot.
Related
I continue trying to perform string format matching using RegExp in VBScript & VB6. I am now trying to match a short, single-line string formatted as:
Seven characters:
a. Six alphanumeric plus one "-" OR
b. Five alphanumeric plus two "-"
Three numbers
Two letters
Literal "65"
A two-digit hex number.
Examples include 123456-789LM65F2, 4EF789-012XY65A5, A2345--789AB65D0 & 23456--890JK65D0.
The RegExp pattern ([A-Z0-9\-]{12})([65][A-F0-9]{2}) lumps (1) - (3) together and finds these OK.
However, if I try to:
c) Break (3) out w/ pattern ([A-Z0-9\-]{10})([A-Z]{2})([65][A-F0-9]{2}),
d) Break out both (2) & (3) w/ pattern ([A-Z0-9\-]{7})([0-9]{3})([A-Z]{2})([65][A-F0-9]{2}), or
e) Tighten up (1) with alternation pattern ([A-Z0-9]{5}[-]{2}|[A-Z0-9]{6}[-]{1})([0-9]{3})([A-Z]{2})([65][A-F0-9]{2})
it refuses to find any of them.
What am I doing wrong? Following is a VBScript that runs and checks these.
' VB Script
Main()
Function Main() ' RegEx_Format_sample.vbs
'Uses two paterns, TestPttn for full format accuracy check & SplitPttn
'to separate the two desired pieces
Dim reSet, EtchTemp, arrSplit, sTemp
Dim sBoule, sSlice, idx, TestPttn, SplitPttn, arrMatch
Dim arrPttn(3), arrItems(3), idxItem, idxPttn, Msgtemp
Set reSet = New RegExp
' reSet.IgnoreCase = True ' Not using
' reSet.Global = True ' Not using
' load test case formats to check & split
arrItems(0) = "0,6 nums + 1 '-',123456-789LM65F2"
arrItems(1) = "1,6 chars + 1 '-',4EF789-012XY65A5"
arrItems(2) = "2,5 chars + 2 '-',A2345--789AB65D0"
arrItems(3) = "3,5 nums + 2 '-',23456--890JK65D0"
SplitPttn = "([A-Z0-9]{5,6})[-]{1,2}([A-Z0-9]{9})" ' split pattern has never failed to work
' load the patterns to try
arrPttn(0) = "([A-Z0-9\-]{12})([65][A-F0-9]{2})"
arrPttn(1) = "([A-Z0-9\-]{10}[A-Z]{2})([65][A-F0-9]{2})"
arrPttn(2) = "([A-Z0-9\-]{7})([0-9]{3})([A-Z]{2})([65][A-F0-9]{2})"
arrPttn(3) = "([A-Z0-9]{5}[-]{2}|[A-Z0-9]{6}[-]{1})([0-9]{3})([A-Z]{2})([65][A-F0-9]{2})"
For idxPttn = 0 To 3 ' select Test pattern
TestPttn = arrPttn(idxPttn)
TestPttn = TestPttn & "[%]" ' append % "ender" char
SplitPttn = SplitPttn & "[%]" ' append % "ender" char
For idxItem = 0 To 3
reSet.Pattern = TestPttn ' set to Test pattern
sTemp = arrItems(idxItem )
arrSplit = Split(sTemp, ",") ' arrSplit is Split array
EtchTemp = arrSplit(2) & "%" ' append % "ender" char to Item sub (2) as the "phrase" under test
If reSet.Test(EtchTemp) = False Then
MsgBox("RegEx " & TestPttn & " false for " & EtchTemp & " as " & arrSplit(1) )
Else ' test OK; now switch to SplitPttn
reSet.Pattern = SplitPttn
Set arrMatch = reSet.Execute(EtchTemp) ' run Pttn as Exec this time
If arrMatch.Count > 0 then ' If test OK then Count s/b > 0
Msgtemp = ""
Msgtemp = "RegEx " & TestPttn & " TRUE for " & EtchTemp & " as " & arrSplit(1)
For idx = 0 To arrMatch.Item(0).Submatches.Count - 1
Msgtemp = Msgtemp & Chr(13) & Chr(10) & "Split segment " & idx & " as " & arrMatch.Item(0).submatches.Item(idx)
Next
MsgBox(Msgtemp)
End If ' Count OK
End If ' test OK
Next ' idxItem
Next ' idxPttn
End Function
Try this Regex:
(?:[A-Z0-9]{6}-|[A-Z0-9]{5}--)[0-9]{3}[A-Z]{2}65[0-9A-F]{2}
Click for Demo
Explanation:
(?:[A-Z0-9]{6}-|[A-Z0-9]{5}--) - matches either 6 Alphanumeric characters followed by a - or 5 Alphanumeric characters followed by a --
[0-9]{3} - matches 3 Digits
[A-Z]{2} - matches 2 Letters
65 - matches 65 literally
[0-9A-F]{2} - matches 2 HEX symbols
You can get some idea from the following code:
VBScript Code:
Option Explicit
Dim objReg, strTest
strTest = "123456-789LM65F2" 'Change the value as per your requirements. You can also store a list of values in an array and run the code in loop
set objReg = new RegExp
objReg.Global = True
objReg.IgnoreCase = True
objReg.Pattern = "(?:[A-Z0-9]{6}-|[A-Z0-9]{5}--)[0-9]{3}[A-Z]{2}65[0-9A-F]{2}"
if objReg.test(strTest) then
msgbox strTest&" matches with the Pattern"
else
msgbox strTest&" does not match with the Pattern"
end if
set objReg = Nothing
Your patterns do not work because:
([A-Z0-9\-]{12})([65][A-F0-9]{2}) - matches 12 occurrences of either an AlphaNumeric character or - followed by either 6 or 5 followed by 2 HEX characters
([A-Z0-9\-]{10}[A-Z]{2})([65][A-F0-9]{2}) - matches 10 occurrences of either an AlphaNumeric character or - followed by 2 Letters followed by either 6 or 5 followed by 2 HEX characters
([A-Z0-9\-]{7})([0-9]{3})([A-Z]{2})([65][A-F0-9]{2}) - matches 7 occurrences of either an AlphaNumeric character or - followed by 3 digits followed by 2 Letters followed by either 6 or 5 followed by 2 HEX characters
([A-Z0-9]{5}[-]{2}|[A-Z0-9]{6}[-]{1})([0-9]{3})([A-Z]{2})([65][A-F0-9]{2}) - matches either 5 occurrences of an AlphaNumeric character followed by -- or 6 occurrences of an Alphanumeric followed by a -. This is then followed by 3 digits followed by 2 Letters followed by either 6 or 5 followed by 2 HEX characters
Try this pattern :
(([A-Z0-9]{5}--)|([A-Z0-9]{6}-))[0-9]{3}[A-Z]{2}65[0-9A-F]{2}
Or, if the last part doesn't like the [A-F]
(([A-Z0-9]{5}--)|([A-Z0-9]{6}-))[0-9]{3}[A-Z]{2}65[0-9ABCDEF]{2}
All, tanx again for your help!!
trincot, everything in each arrItems() between the commas, incl the the "plus", is merely part of a shorthand description of each item's characteristics, such as "5 characters plus 2 dashes".
Gurman, your pttn breakdowns were helpful, but, if I read it right, the addition of the ? prefix is a "Match zero or one occurrences" and this must match exactly one occurrence. Also, my 1st pattern (matches 12) actually DID work for all my test cases.
jNevill, & JMichelB your suggestions are very close to what I ended up with.
I was "over-classing". After some tinkering, I was able to get the Test Pttn to successfully recognize these test cases by taking the [65] out of the [] in my original Alternation pattern. That is I went from ([65]) to (65) and Zammo! it worked.
Orig pattern:
([A-Z0-9]{5}[-]{2}|[A-Z0-9]{6}[-]{1})([0-9]{3})([A-Z]{2})([65][A-F0-9]{2})
Wkg pattern:
([A-Z0-9]{5}[-]{2}|[A-Z0-9]{6}[-]{1})([0-9]{3})([A-Z]{2})(65)([A-F0-9]{2})
Oh, and I moved the
SplitPttn = SplitPttn & "[%]" ' append % "ender" char
stmt up out of the For...Next loop. That helped w/ the splitting.
T-Bone
I have a function I've written to handle calculations of percent reductions with significant digits, and I'm having a problem with keeping trailing zeroes.
The function:
Function RegexReduction(IValue As Double, EValue As Double) As String
Dim TempPercent As Double
Dim TempString As String
Dim NumFormat As String
Dim DecPlaces As Long
Dim regex As Object
Dim rxMatches As Object
TempPercent = (1 - EValue / IValue)
NumFormat = "0"
Set regex = CreateObject("VBScript.RegExp")
With regex
.Pattern = "([^1-8])*[0-8]{1}[0-9]?"
.Global = False
End With
Set rxMatches = regex.Execute(CStr(TempPercent))
If rxMatches.Count <> 0 Then
TempString = rxMatches.Item(0)
DecPlaces = Len(Split(TempString, ".")(1)) - 2
If DecPlaces > 0 Then NumFormat = NumFormat & "." & String(DecPlaces, "0")
End If
RegexReduction = Format(TempPercent, NumFormat & "%")
End Function
This trims percentages to two digits after any leading zeroes or nines:
99.999954165% -> 99.99954%
34.564968% -> 35%
0.000516% -> 0.00052%
The one problem I've found isn't related to the regex, but to Excel's rounding:
99.50% -> 99.5%
Is there a solution that will save trailing zeroes that could be implemented here?
I suggest a version of your function that uses LTrim in combination with Replace instead of the (costly) regular expression to calculate the value of DecPlaces. The calculation of DecPlaces has become a "one-liner".
The rest of the code is the same except for the additional call to CDec to avoid CStr from returning a scientific notation (like 1.123642E-12) when the value is tiny.
Function Reduction(IValue As Double, EValue As Double) As String
Dim TempPercent As Double
Dim TempString As String
Dim NumFormat As String
Dim DecPlaces As Long
TempPercent = (1 - EValue / IValue)
' Apply CDec so tiny numbers do not get scientific notation
TempString = CStr(CDec(TempPercent))
' Count number of significant digits present by trimming away all other chars,
' and subtract from total length to get number of decimals to display
DecPlaces = Len(TempString) - 2 - _
Len(LTrim(Replace(Replace(Replace(TempString, "0"," "), "9"," "), "."," ")))
' Prepare format of decimals, if any
If DecPlaces > 0 Then NumFormat = "." & String(DecPlaces, "0")
' Apply format
Reduction = Format(TempPercent, "0" & NumFormat & "%")
End Function
It is assumed that TempPercent evaluates to a value between 0 and 1.
Comments on your code
You wrote:
The one problem I've found isn't related to the regex, but to Excel's rounding:
99.50% -> 99.5%
This is actually not related to Excel's rounding. In your code the following
DecPlaces = Len(Split(TempString, ".")(1)) - 2
will evaluate to Len(Split("0.995", ".")(1)) - 2, which is 1, and so the format you apply is 0.0%, explaining the output you get.
Also realise that although you have a capturing group in your regular expression, you do not actually use it. rxMatches.Item(0) will give you the complete matched string, not only the match with the capture group.
You apply a number format of 0% for the case the regular expression does not yield a match. Any number that has no other digits than 0 and 9 will not match. For instance 0.099 should be displayed with format 0.000% to give 9.900, but the format used is 0% as you have no Else block treating this case.
Finally, CStr can turn numbers into scientific notation, which will give wrong results as well. It seems with CDec this can be avoided.
Here is a UDF that attempts to 'read' the incoming raw (non-percentage) value in order to determine the number of decimal places to include.
Function udf_Specific_Scope(rng As Range)
Dim i As Long, str As String
str = rng.Value2 'raw value is 0.999506 for 99.9506%
For i = 1 To Len(str) - 1
If Asc(Mid(str, i, 1)) <> 48 And _
Asc(Mid(str, i, 1)) <> 57 And _
Asc(Mid(str, i, 1)) <> 46 Then _
Exit For
Next i
If InStr(1, str, Chr(46)) < i - 1 Then
udf_Specific_Scope = Val(Format(rng.Value2 * 100, "0." & String(i - 3, Chr(48)))) & Chr(37)
Else
udf_Specific_Scope = Format(rng.Value2, "0%")
End If
End Function
The disadvantage here is removing the numerical value from the cell entry but that mirrors your original RegEx method. Ideally, something like the above could be written as a sub based on the Application.Selection property. Just highlight (aka Select) some cells, run the sub and it assigns a cell number format with the correct number of decimals to each in the selection.
I am working in excel and need VBA code to extract 3 specific number patterns. In column A I have several rows of strings which include alphabetical characters, numbers, and punctuation. I need to remove all characters except those found in a 13-digit number (containing only numbers), a ten-digit number (containing only numbers), or a 9-digit number immediately followed by an "x" character. These are isbn numbers.
The remaining characters should be separated by one, and only one, space. So, for the following string found in A1: "There are several books here, including 0192145789 and 9781245687456. Also, the book with isbn 045789541x is included. This book is one of 100000000 copies."
The output should be: 0192145789 9781245687456 045789541x
Note that the number 100000000 should not be included in the output because it does not match any of the three patterns mentioned above.
I'm not opposed to a excel formula solution as opposed to VBA, but I assumed that VBA would be cleaner. Thanks in advance.
Here's a VBA function that will do specifically what you've specified
Function ExtractNumbers(inputStr As String) As String
Dim outputStr As String
Dim bNumDetected As Boolean
Dim numCount As Integer
Dim numStart As Integer
numCount = 0
bNumDetected = False
For i = 1 To Len(inputStr)
If IsNumeric(Mid(inputStr, i, 1)) Then
numCount = numCount + 1
If Not bNumDetected Then
bNumDetected = True
bNumStart = i
End If
If (numCount = 9 And Mid(inputStr, i + 1, 1) = "x") Or _
numCount = 13 And Not IsNumeric(Mid(inputStr, i + 1, 1)) Or _
numCount = 10 And Not IsNumeric(Mid(inputStr, i + 1, 1)) Then
If numCount = 9 Then
outputStr = outputStr & Mid(inputStr, bNumStart, numCount) & "x "
Else
outputStr = outputStr & Mid(inputStr, bNumStart, numCount) & " "
End If
End If
Else
numCount = 0
bNumDetected = False
End If
Next i
ExtractNumbers = Trim(outputStr)
End Function
It's nothing fancy, just uses string functions to goes through your string one character at a time looking for sections of 9 digit numbers ending with x, 10 digit numbers and 13 digit numbers and extracts them into a new string.
It's a UDF so you can use it as a formula in your workbook
special character set: `~!##$%^&*()_-+={}[]\|:;""'<>,.?/
Is this the right way to search for items within that special character set?
Regex.IsMatch(Result.Text, "^[`-/]")
I always get an error.... Do I need to use ASCII codes? If so, how can I do this? Thank you in advance!
Regular expressions have a different escape syntax and rules than VB.NET. Since you're dealing with a regex string in your code, you have to make sure the string is escaped properly for regex and VB.NET.
In your example, the - needs to be escaped with a ...
Regex.IsMatch(Result.Text, "^[`\-/]")
To match any character in the provided string, try this...
Regex.IsMatch(Result.Text, "[`~!##\$%\^&\*\(\)_\-\+=\{\}\[\]\\\|:;""'<>,\.\?/]")
Try this:
[`~!##$%^&*()_+={}\[\]\\|:;""'<>,.?/-]
Working regex example:
http://regex101.com/r/aU3wO9
In a char set, only some of them need to be escaped. like ] and \ and if - is at the end it doesn't need to be escaped. I'm not sure about [ so I just escaped it anyway.
You can use an escape character in the string where there is a special character
Dim str() As Byte
Dim j, n As Long
Dim ContainsSpecialChar As Boolean
Dim TempVendorName As String
str = Trim(VendorName)
n = 1
For j = 0 To UBound(str) - 1 Step 2
If (str(j) > 32 And str(j) < 47) Or (str(j) > 57 And str(j) < 65) Or (str(j) > 90 And str(j) < 97) Or (str(j) > 122) Then
ContainsSpecialChar = True
TempVendorName = Left(VendorName, n - 1) + "\" + Mid(VendorName, n)
n = n + 1
End If
n = n + 1
Next
Let's suppose I have a data set of several hundred thousand strings (which happen to be natural language sentences, if it matters) which are each tagged with a certain "label". Each sentence is tagged with exactly one label, and there are about 10 labels, each with approximately 10% of the data set belonging to them. There is a high degree of similarity to the structure of sentences within a label.
I know the above sounds like a classical example of a machine learning problem, but I want to ask a slightly different question. Are there any known techniques for programatically generating a set of regular expressions for each label, which can successfully classify the training data while still generalizing to future test data?
I would be very happy with references to the literature; I realize that this will not be a straightforward algorithm :)
PS: I know that the normal way to do classification is with machine learning techniques like an SVM or such. I am, however, explicitly looking for a way to generate regular expressions. (I would be happy with with machine learning techniques for generating the regular expressions, just not with machine learning techniques for doing the classification itself!)
This problem is usually framed as how to generate finite automata from sets of strings, rather than regular expressions, though you can obviously generate REs from FAs since they are equivalent.
If you search around for automata induction, you should be able to find quite a lot of literature on this topic, including GA approaches.
So far as I know, this is the subject of current research in evolutionary computation.
Here are some examples:
See slides 40-44 at
https://cs.byu.edu/sites/default/files/Ken_De_Jong_slides.pdf
(slides exist as of the posting of this answer).
Also, see
http://www.citeulike.org/user/bartolialberto/article/10710768
for a more detailed review of a system presented at GECCO 2012.
Note: May this would help someway. This below function generates RegEx pattern for a given value of a and b. Where a and b both are alpha-strings. And the function would generate a fair RegEx pattern to match the range between a and b. The function would take only first three chars to produce the pattern and produces a result that might be something like starts-with() function in some language with hint of a general RegEx favor.
A simple VB.NET example
Public Function GetRangePattern(ByVal f_surname As String, ByVal l_surname As String) As String
Dim f_sn, l_sn As String
Dim mnLength% = 0, mxLength% = 0, pdLength% = 0, charPos% = 0
Dim fsn_slice$ = "", lsn_slice$ = ""
Dim rPattern$ = "^"
Dim alphas As New Collection
Dim tmpStr1$ = "", tmpStr2$ = "", tmpStr3$ = ""
'///init local variables
f_sn = f_surname.ToUpper.Trim
l_sn = l_surname.ToUpper.Trim
'///do null check
If f_sn.Length = 0 Or l_sn.Length = 0 Then
Return "-!ERROR!-"
End If
'///return if both equal
If StrComp(f_sn, l_sn, CompareMethod.Text) = 0 Then
Return "^" & f_sn & "$"
End If
'///return if 1st_name present in 2nd_name
If InStr(1, l_sn, f_sn, CompareMethod.Text) > 0 Then
tmpStr1 = f_sn
tmpStr2 = l_sn.Replace(f_sn, vbNullString)
If Len(tmpStr2) > 1 Then
tmpStr3 = "[A-" & tmpStr2.Substring(1) & "]*"
Else
tmpStr3 = tmpStr2 & "*"
End If
tmpStr1 = "^" & tmpStr1 & tmpStr3 & ".*$"
tmpStr1 = tmpStr1.ToUpper
Return tmpStr1
End If
'///initialize alphabets
alphas.Add("A", CStr(Asc("A")))
alphas.Add("B", CStr(Asc("B")))
alphas.Add("C", CStr(Asc("C")))
alphas.Add("D", CStr(Asc("D")))
alphas.Add("E", CStr(Asc("E")))
alphas.Add("F", CStr(Asc("F")))
alphas.Add("G", CStr(Asc("G")))
alphas.Add("H", CStr(Asc("H")))
alphas.Add("I", CStr(Asc("I")))
alphas.Add("J", CStr(Asc("J")))
alphas.Add("K", CStr(Asc("K")))
alphas.Add("L", CStr(Asc("L")))
alphas.Add("M", CStr(Asc("M")))
alphas.Add("N", CStr(Asc("N")))
alphas.Add("O", CStr(Asc("O")))
alphas.Add("P", CStr(Asc("P")))
alphas.Add("Q", CStr(Asc("Q")))
alphas.Add("R", CStr(Asc("R")))
alphas.Add("S", CStr(Asc("S")))
alphas.Add("T", CStr(Asc("T")))
alphas.Add("U", CStr(Asc("U")))
alphas.Add("V", CStr(Asc("V")))
alphas.Add("W", CStr(Asc("W")))
alphas.Add("X", CStr(Asc("X")))
alphas.Add("Y", CStr(Asc("Y")))
alphas.Add("Z", CStr(Asc("Z")))
'///populate max-min length values
mxLength = f_sn.Length
If l_sn.Length > mxLength Then
mnLength = mxLength
mxLength = l_sn.Length
Else
mnLength = l_sn.Length
End If
'///padding values
pdLength = mxLength - mnLength
f_sn = f_sn.PadRight(mxLength, "A")
'f_sn = f_sn.PadRight(mxLength, "~")
l_sn = l_sn.PadRight(mxLength, "Z")
'l_sn = l_sn.PadRight(mxLength, "~")
'///get a range like A??-B??
If f_sn.Substring(0, 1).ToUpper <> l_sn.Substring(0, 1).ToUpper Then
fsn_slice = f_sn.Substring(0, 3).ToUpper
lsn_slice = l_sn.Substring(0, 3).ToUpper
tmpStr1 = fsn_slice.Substring(0, 1) & fsn_slice.Substring(1, 1) & "[" & fsn_slice.Substring(2, 1) & "-Z]"
tmpStr2 = lsn_slice.Substring(0, 1) & lsn_slice.Substring(1, 1) & "[A-" & lsn_slice.Substring(2, 1) & "]"
tmpStr3 = "^(" & tmpStr1 & "|" & tmpStr2 & ").*$"
Return tmpStr3
End If
'///looping charwise
For charPos = 0 To mxLength
fsn_slice = f_sn.Substring(charPos, 1)
lsn_slice = l_sn.Substring(charPos, 1)
If StrComp(fsn_slice, lsn_slice, CompareMethod.Text) = 0 Then
rPattern = rPattern & fsn_slice
Else
'rPattern = rPattern & "("
If charPos < mxLength Then
Try
If Asc(fsn_slice) < Asc(lsn_slice) Then
tmpStr1 = fsn_slice & "[" & f_sn.Substring(charPos + 1, 1) & "-Z" & "]|"
If CStr(alphas.Item(Key:=CStr(Asc(fsn_slice) + 1))) < CStr(alphas.Item(Key:=CStr(Asc(lsn_slice) - 1))) Then
tmpStr2 = "[" & CStr(alphas.Item(Key:=CStr(Asc(fsn_slice) + 1))) & "-" & CStr(alphas.Item(Key:=CStr(Asc(lsn_slice) - 1))) & "]|"
Else
tmpStr2 = vbNullString
End If
tmpStr3 = lsn_slice & "[A-" & l_sn.Substring(charPos + 1, 1) & "]"
rPattern = rPattern & "(" & tmpStr1 & tmpStr2 & tmpStr3 & ").*$"
'MsgBox("f_sn:= " & f_sn & " -- l_sn:= " & l_sn & vbCr & rPattern)
Exit For
Else
Return "-#ERROR#-"
End If
Catch ex As Exception
Return "-|ERROR|-" & ex.Message
End Try
End If
End If
Next charPos
Return rPattern
End Function
And it is called as
?GetRangePattern("ABC","DEF")
produces this
"^(AB[C-Z]|DE[A-F]).*$"