AHK - How to use dynamic hotstring with a barcode scanner - regex

I need to trigger a subroutine when a serial number of a product has been scanned in with a barcode scanner. The serial number looks like this: 11NNNN22334. I then need to use the scanned in serial number as a variable.
I tried the dynamic regular expression hotstrings library, which I include below, but I can't make it work reliably with a barcode scanner (it's too fast). I don't want to slow down the barcode scanner. It either does not trigger the subroutine at all, or it leaves the first digit of the serial number behind after the subroutine has been triggered. Any ideas?
; Register the serial-number pattern first. The auto-execute section stops at the
; first `return`, so the registration has to come before the label's body;
; otherwise the call is never reached and the hotstring is never installed.
hotstrings("([0-9][0-9]NNNN[0-9][0-9][0-9][0-9][0-9])", "Test")
return

; Subroutine run by hotstrings() when the pattern above matches.
Test:
MsgBox, %$1% ; THIS IS THE STRING THAT TRIGGERED THE SUBROUTINE
return
/*
Function: HotStrings
Dynamically adds regular expression hotstrings.
Parameters:
c - regular expression hotstring
a - (optional) text to replace hotstring with or a label to goto,
leave blank to remove hotstring definition from triggering an action
Examples:
> hotstrings("(B|b)tw\s", "%$1%y the way") ; type 'btw' followed by space, tab or return
> hotstrings("i)omg", "oh my god!") ; type 'OMG' in any case, upper, lower or mixed
> hotstrings("\bcolou?r", "rgb(128, 255, 0);") ; '\b' prevents matching with anything before the word, e.g. 'multicololoured'
License:
- RegEx Dynamic Hotstrings: Modified version by Edd
- Original: <http://www.autohotkey.net/~polyethene/#hotstrings>
- Dedicated to the public domain (CC0 1.0) <http://creativecommons.org/publicdomain/zero/1.0/>
*/
; hotstrings(k, a, Options)
;   k       - regular expression to register or remove. When both k and a are empty
;             the call is a "poll": it processes the key that just fired the __hs hotkey.
;   a       - replacement text or the name of a label; blank removes the definition for k.
;   Options - stored with the definition. If it contains the word NB (any case), the
;             matched text is not erased with backspaces before the action runs.
hotstrings(k, a = "", Options:="")
{
; Persistent state:
;   z  - set to 1 once the key hooks have been installed
;   s  - buffer of recently typed text
;   t  - registered definitions, one `n-separated record each: regex`raction`roptions
;   w  - buffer length above which s gets trimmed
;   sd - decimal separator read from the registry (used for NumpadDot)
;   d  - keys that clear the buffer
;   f, f_ - characters that get wrapped in braces before the replacement is sent
static z, m = "~$", m_ = "*~$", s, t, w = 2000, sd, d = "Left,Right,Up,Down,Home,End,RButton,LButton", f = "!,+,^,#", f_="{,}"
global $
; First call only: hook every key that can contribute to the buffer.
If z = ; init
{
RegRead, sd, HKCU, Control Panel\International, sDecimal
; Chr(33)..Chr(126). Indexes 33-58 are "A".."Z" and get the wildcard prefix m_;
; indexes 65-90 ("a".."z") are skipped; everything else gets the plain prefix m.
Loop, 94
{
c := Chr(A_Index + 32)
If A_Index between 33 and 58
Hotkey, %m_%%c%, __hs
else If A_Index not between 65 and 90
Hotkey, %m%%c%, __hs
}
; Numpad keys.
e = 0,1,2,3,4,5,6,7,8,9,Dot,Div,Mult,Add,Sub,Enter
Loop, Parse, e, `,
Hotkey, %m%Numpad%A_LoopField%, __hs
; Editing, whitespace and navigation keys.
e = BS,Shift,Space,Enter,Return,Tab,%d%
Loop, Parse, e, `,
Hotkey, %m%%A_LoopField%, __hs
z = 1
}
If (a == "" and k == "") ; poll
{
; Strip the "*~$" / "~$" hotkey prefix to recover the bare key name.
q:=RegExReplace(A_ThisHotkey, "\*\~\$(.*)", "$1")
q:=RegExReplace(q, "\~\$(.*)", "$1")
; Backspace removes the last buffered character (unless it is "}").
If q = BS
{
If (SubStr(s, 0) != "}")
StringTrimRight, s, s, 1
}
; Navigation keys and mouse clicks discard the whole buffer.
Else If q in %d%
s =
Else
{
; Translate the key name into the character it types, then append it to the buffer.
If q = Shift
return
Else If q = Space
q := " "
Else If q = Tab
q := "`t"
Else If q in Enter,Return,NumpadEnter
q := "`n"
Else If (RegExMatch(q, "Numpad(.+)", n))
{
q := n1 == "Div" ? "/" : n1 == "Mult" ? "*" : n1 == "Add" ? "+" : n1 == "Sub" ? "-" : n1 == "Dot" ? sd : ""
If n1 is digit
q = %n1%
}
; Lower-case the key when Shift and CapsLock are in the same state (both off or both on).
Else If (GetKeyState("Shift") ^ !GetKeyState("CapsLock", "T"))
StringLower, q, q
s .= q
}
; Test the end of the buffer against every registered regex.
Loop, Parse, t, `n ; check
{
; x1 = regex, x2 = replacement text or label, x3 = options
StringSplit, x, A_LoopField, `r
If (RegExMatch(s, x1 . "$", $)) ; match
{
; Drop the matched text from the buffer and, unless NB was given, erase it on screen too.
StringLen, l, $
StringTrimRight, s, s, l
if !(x3~="i)\bNB\b") ; if No Backspace "NB"
SendInput, {BS %l%}
If (IsLabel(x2))
Gosub, %x2%
Else
{
; Expand variable references in the replacement, then wrap the characters listed in
; f_ and f in braces before handing the text to SendInput. The ¥ markers are a
; temporary placeholder used while the brace characters themselves are wrapped.
Transform, x0, Deref, %x2%
Loop, Parse, f_, `,
StringReplace, x0, x0, %A_LoopField%, ¥%A_LoopField%¥, All
Loop, Parse, f_, `,
StringReplace, x0, x0, ¥%A_LoopField%¥, {%A_LoopField%}, All
Loop, Parse, f, `,
StringReplace, x0, x0, %A_LoopField%, {%A_LoopField%}, All
SendInput, %x0%
}
}
}
; Keep the buffer bounded: once it exceeds w characters, drop the oldest w // 2.
If (StrLen(s) > w)
StringTrimLeft, s, s, w // 2
}
Else ; assert
{
; Store newlines in the regex as escape sequences so they cannot break the record format.
StringReplace, k, k, `n, \n, All ; normalize
StringReplace, k, k, `r, \r, All
; Remove any existing record whose regex part equals k ...
Loop, Parse, t, `n
{
l = %A_LoopField%
If (SubStr(l, 1, InStr(l, "`r") - 1) == k)
StringReplace, t, t, `n%l%
}
; ... and append the new record unless the action is blank (blank means "remove only").
If a !=
t = %t%`n%k%`r%a%`r%Options%
}
Return
; Target of every hooked key: re-enter the function in poll mode.
__hs: ; event
hotstrings("", "", Options)
Return
}

You can try to speed up the hotstrings function by fiddling with SetBatchLines:
hotstrings(k, a = "", Options:="")
{
; Remember the caller's batch-lines setting, then run this function at full speed.
prevBatchlines := A_BatchLines
SetBatchLines, -1
... ; rest of function here
; reset to whatever it was
; (the function body stays open here: the restore code and the __hs label below
; must live inside the function, so there is exactly one closing brace at the end)
SetBatchLines, %prevBatchlines%
Return
__hs: ; event
hotstrings("", "", Options)
Return
}
Although usually not recommended (it's nonzero by default for a reason), sometimes it is the only way.

Maybe give this a shot, it's setup to wait for your required syntax:
; Buffer of the keys captured so far (starts empty).
code:=
; Register a pass-through (~) hotkey for every letter key ...
for k, v in StrSplit("QWERTYUIOPASDFGHJKLZXCVBNM")
Hotkey, % "~" v, WaitForBarcode
; ... and for the digits 9 down to 0, on both the number row and the numpad.
Loop 10 {
Hotkey, % "~" (10-A_Index) "", WaitForBarcode
Hotkey, % "~Numpad" (10-A_Index) "", WaitForBarcode
}
return
; Called with the complete 11-character code. It returns nothing, so the caller's
; assignment below leaves `code` empty afterwards.
FoundCode(var) {
MsgBox % "Caught code: " var
}
; Hotkey handler: appends the pressed key to `code` and checks the buffer's shape.
WaitForBarcode(){
global code
; Last character of the hotkey name, i.e. the key itself (SubStr with start 0 - AHK v1 behaviour).
k:=SubStr(A_ThisHotkey,0)
code.=k
; Nested ternary, evaluated left to right. The buffer is restarted with the current key when
; is() returns 1 for characters 1-2, 2 for characters 3-6, or 1 for characters 7-11.
; Otherwise, once the buffer is 11 characters long it is handed to FoundCode();
; failing that, the buffer is kept as it is.
code:=(is(SubStr(code,1,2))=1)?k:(is(SubStr(code,3,4))=2)?k:(is(SubStr(code,7,5))=1)?k:(StrLen(code)=11)?FoundCode(code):code
}
; Classifies var: returns 1 when it fails the "digit" test, 2 when it passes the
; "digit" test but fails the "alpha" test, and nothing when it passes both.
is(var) {
    if var is digit
    {
        if var is not alpha
            return 2
        return
    }
    return 1
}
I have no way of testing it with any input device other than keyboard, maybe it will work, maybe not.
Alternative for looping through keys would be:
; Registers one pass-through hotkey per character code 48..90 ("0" through "Z", which
; also covers the punctuation between "9" and "A"). The target label Bar is not
; defined in this snippet - presumably it stands for the handler shown above.
Loop 43
Hotkey, % "~" Chr(A_Index+47), Bar

At work we have a lot of USB barcode scanners that type the scan results to the keyboard buffer.
If you have access to the barcode scanner and it's a hardware scanner,
you can usually define a prefix/postfix code that the scanner sends before/after the scan. Check your scanner manual; to set it you normally just scan a few barcodes.
If you define the prefix code as a hotkey you can then run code to capture the letters until the post fix.
A simple example on key capture is
; Endless capture loop: reads one key at a time and appends it to `log`.
; Input options as documented for AHK v1: I = ignore script-generated input,
; L1 = stop after one character, V = let the keystroke through to the active window.
Loop {
Input, key, I L1 V
log = %log%%key%
}
; Win+S shows everything captured so far.
#s::MsgBox, 64, Key History, %log%
It should be easy to change this to stop looping after the postfix key of your choice.
source here

Although it's not a solution, I managed to find a workaround by changing the scanner's suffix from carriage return to tab and using the original method I posted.

Related

How is w1_to_w2.count and w2_to_w1.count working

class Solution {
public:
    // Returns every word in `words` that matches pattern `p`, i.e. for which a
    // one-to-one letter mapping between the word and the pattern exists.
    std::vector<std::string> findAndReplacePattern(std::vector<std::string>& words, std::string p) {
        std::vector<std::string> ans;
        for (auto &w : words) {
            if (match(w, p))
                ans.push_back(w);
        }
        return ans;
    }

    // True when w1 and w2 have the same length and their letters correspond
    // one-to-one in both directions.
    bool match(std::string &w1, std::string &w2) {
        // Strings of different length can never match; this also keeps w2[i] in range.
        if (w1.size() != w2.size())
            return false;
        std::unordered_map<char, char> w1_to_w2, w2_to_w1;
        for (std::size_t i = 0; i < w1.size(); i++) {
            // A letter seen before must map to the same partner as it did then.
            if ((w1_to_w2.count(w1[i]) && w1_to_w2[w1[i]] != w2[i]) ||
                (w2_to_w1.count(w2[i]) && w2_to_w1[w2[i]] != w1[i]))
                return false;
            w1_to_w2[w1[i]] = w2[i];
            w2_to_w1[w2[i]] = w1[i];
        }
        return true;
    }
};
Input:
words = ["abc","deq","mee","aqq","dkd","ccc"]
pattern = "abb"
Output:
["mee","aqq"]
Explanation:
"mee" matches the pattern because there is a permutation {a -> m, b -> e, ...}. "ccc" does not match the pattern because {a -> c, b -> c, ...} is not a permutation, since a and b map to the same letter.
How is this working, especially count? Please explain step by step.
findAndReplacePattern(words, p) just calls match(word, p) for each word in words - if they match then word is appended to ans, which is later returned.
In match(w1, w2), w1_to_w2 and w2_to_w1 are initially empty, and i counts its way along both strings (which are assumed to be of the same length - often a dangerous assumption that can lead to programs that crash).
On the first iteration, when i is 0, the .count(...) calls return 0 because there's nothing in the unordered_maps yet, so the function doesn't return. Instead it evaluates the else clause, which records that:
the first letter in w1 - w1[0] - corresponds to the first letter in w2, by doing the assignment w1_to_w2[w1[i]] = w2[i], and
the first letter in w2 - w2[0] - corresponds to the first letter in w1, by doing the assignment w2_to_w1[w2[i]] = w1[i].
As it makes its way along the rest of the letters in the words...
w1_to_w2.count(w1[i]) && w1_to_w2[w1[i]] != w2[i]
...is saying that if we've already seen the same letter as w1[i] before in w1 and the letter in w2 it corresponded to before (i.e. w1_to_w2[w1[i]]) is not the same letter it corresponds to now (i.e. w2[i]), that would be one reason to return false, or...
w2_to_w1.count(w2[i]) && w2_to_w1[w2[i]] != w1[i]
...if we've already seen the same letter as w2[i] before in w2 and the letter in w1 it corresponded to before (i.e. w2_to_w1[w2[i]]) is not the same letter it corresponds to now (i.e. w1[i]), that would be another reason to return false.

Backspace String Compare Leetcode Question

I have a question about the following problem on Leetcode:
Given two strings S and T, return if they are equal when both are typed into empty text editors. # means a backspace character.
Example 1:
Input: S = "ab#c", T = "ad#c"
Output: true
Explanation: Both S and T become "ac".
Example 2:
Input: S = "ab##", T = "c#d#"
Output: true
Explanation: Both S and T become "".
Example 3:
Input: S = "a##c", T = "#a#c"
Output: true
Explanation: Both S and T become "c".
Example 4:
Input: S = "a#c", T = "b"
Output: false
Explanation: S becomes "c" while T becomes "b".
Note:
1 <= S.length <= 200
1 <= T.length <= 200
S and T only contain lowercase letters and '#' characters.
Follow up:
Can you solve it in O(N) time and O(1) space?
My answer:
# Returns true when s and t produce the same text after every '#' has been
# applied as a backspace, false otherwise.
#
# Returns the string "fail" (kept for compatibility with the original) when an
# input contains anything other than lowercase letters and '#', or is longer
# than 200 characters.
#
# Each string is processed in one left-to-right pass with a stack, replacing the
# recursive, heavily backtracking regular expression that caused the time-outs.
def backspace_compare(s, t)
  return "fail" if [s, t].any? { |str| str.match?(/[^#a-z]/) || str.length > 200 }

  typed = lambda do |str|
    # Array#pop on an empty array is a no-op, which is exactly how a backspace
    # in an empty editor behaves.
    str.each_char.with_object([]) { |ch, out| ch == '#' ? out.pop : out << ch }.join
  end

  typed.call(s) == typed.call(t)
end
It works in the terminal and passes the given examples, but when I submit it on leetcode it tells me Time Limit Exceeded. I tried shortening it to:
rubular = /^[\#]+|([^\#](\g<1>)*[\#]+)/
new_s = s.match?(/#/) ? s.gsub(rubular, '') : s
new_t = t.match?(/#/) ? t.gsub(rubular, '') : t
new_s == new_t
But also the same error.
So far, I believe my code fulfills the O(n) time, because there are only two ternary operators, which overall is O(n). I'm making 3 assignments and one comparison, so I believe that fulfills the O(1) space complexity.
I have no clue how to proceed beyond this, been working on it for a good 2 hours..
Please point out if there are any mistakes in my code, and how I am able to fix it.
Thank you! :)
Keep in mind that with N <= 200, your problem is more likely to be linear coefficient, not algorithm complexity. O(N) space is immaterial for this; with only 400 chars total, space is not an issue. You have six regex matches, two of which are redundant. More important, regex is slow processing for such a specific application.
For speed, drop the regex stuff and do this one of the straightforward, brute-force ways: run through each string in order, applying the backspaces as appropriate. For instance, change both the backspace and the preceding letter to spaces. At the end of your checking, remove all the spaces in making a new string. Do this with both S and T; compare those for equality.
It may be easiest to start at the end of the string and work towards the beginning:
# Applies every '#' in str as a backspace and returns the resulting text.
# The string is scanned from its end: each '#' adds one pending deletion, and
# each other character either consumes a pending deletion or is kept.
def process(str)
  pending = 0
  kept = []
  str.chars.reverse_each do |ch|
    if ch == '#'
      pending += 1
    elsif pending > 0
      pending -= 1
    else
      kept << ch
    end
  end
  kept.reverse.join
end
%w|ab#c ad#c ab## c#d# a##c #a#c a#c b|.each_slice(2) do |s1, s2|
puts "\"%s\" -> \"%s\", \"%s\" -> \"%s\" %s" %
[s1, process(s1), s2, process(s2), (process(s1) == process(s2)).to_s]
end
"ab#c" -> "ac", "ad#c" -> "ac" true
"ab##" -> "", "c#d#" -> "" true
"a##c" -> "c", "#a#c" -> "c" true
"a#c" -> "c", "b" -> "b" false
Let's look at a longer string.
require 'time'
alpha = ('a'..'z').to_a
#=> ["a", "b", "c",..., "z"]
s = (10**6).times.with_object('') { |_,s|
s << (rand < 0.4 ? '#' : alpha.sample) }
#=> "h####fn#fjn#hw###axm...#zv#f#bhqsgoem#glljo"
s.size
#=> 1000000
s.count('#')
#=> 398351
and see how long it takes to process.
require 'time'
start_time = Time.now
(u = process(s)).size
#=> 203301
puts (Time.now - start_time).round(2)
#=> 0.28 (seconds)
u #=> "ffewuawhfa...qsgoeglljo"
As u will be missing the 398351 pound signs in s, plus an almost equal number of other characters removed by the pound signs, we would expect u.size to be about:
10**6 - 2 * s.count('#')
#=> 203298
In fact, u.size #=> 203301, meaning that, at the end, 203301 - 203298 #=> 3 pound signs were unable to remove a character from s.
In fact, process can be simplified. I leave that as an exercise for the reader.
class Solution {
    /**
     * Returns true when {@code s} and {@code t} yield the same text after every
     * '#' has been applied as a backspace.
     *
     * The two resulting stacks are compared with {@code equals}, which compares
     * the characters by value. The previous {@code peek() != peek()} compared
     * boxed {@code Character} references and only worked for cached values.
     * Exceptions are no longer swallowed and reported as "equal".
     *
     * @throws NullPointerException if either argument is null
     */
    public boolean backspaceCompare(String s, String t) {
        return convertToStack(s).equals(convertToStack(t));
    }

    /**
     * Builds the text typed by {@code s}: ordinary characters are pushed, and a
     * '#' pops the most recent character if there is one.
     */
    public Stack<Character> convertToStack(String s) {
        Stack<Character> typed = new Stack<>();
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c != '#') {
                typed.push(c);
            } else if (!typed.isEmpty()) {
                typed.pop();
            }
        }
        return typed;
    }
}

Match word by its prefix

I'm trying to match a string by its prefix that ends with a particular character. For example, if my string is "abcd" and ends in #, then any word which is a prefix of "abcd" should be matched as long as it ends with #. Here are some examples to help illustrate the pattern:
Input: "ab#" gives true (as "ab" is a prefix of "abcd" and end with a #).
Input: "abcd#" gives true (as "abcd" is a prefix of "abcd" and end with a #).
Input: "bc#" gives false (as "bc" is not a prefix of "abcd").
Input: "ab" gives false (while "ab" is a prefix of "abcd", it doesn't end with #).
Input: "ac#" gives false (while "ac" is contained within "abcd", it doesn't begin with a prefix from "abcd") .
So far, I've managed to come up with the following expression which seems to be working fine:
/(abcd|abc|ab|a)#/
While this is working, it isn't very practical, as larger words of length n will make the expression quite large:
/(n|n-1|n-2| ... |1)#/
Is there a way to rewrite this expression so it is more scalable and concise?
Example of my attempt (in JS):
// Alternation of every prefix of "abcd", followed by '#'.
const regex = /(abcd|abc|ab|a)#/;
console.log(regex.test("abcd#")); // true
console.log(regex.test("ab#")); // true
console.log(regex.test("abc#")); // true
console.log(regex.test("abz#")); // false
console.log(regex.test("abc#")); // true - same input as two lines up; presumably "abc" (no '#') was meant, which gives false
Edit: Some of the solutions provided are nice and do do what I'm after, however, for this particular question, I'm after a solution which uses pure regular expressions to match the prefix.
Just use String#startsWith and String#endsWith here:
String input = "abcd";
String prefix = "ab#";
// The candidate is the part before the trailing '#'. It must be non-empty, otherwise
// a bare "#" would count as a match because every string starts with "".
String candidate = prefix.replaceAll("#$", "");
if (prefix.endsWith("#") && !candidate.isEmpty() && input.startsWith(candidate)) {
    System.out.println("MATCH");
}
else {
    System.out.println("NO MATCH");
}
Edit: A JavaScript version of the above:
var input = "abcd";
var prefix = "ab#";
// The candidate is the part before the trailing '#'. It must be non-empty, otherwise
// a bare "#" would count as a match because every string starts with "".
var candidate = prefix.replace(/#$/, "");
if (prefix.endsWith("#") && candidate.length > 0 && input.startsWith(candidate)) {
    console.log("MATCH");
}
else {
    console.log("NO MATCH");
}
Try ^ab?c?d?#$
Explanation:
`^` - match beginning of a string
`b?` - match zero or one `b`
The rest is analogous to the above.
Demo
Here's a left-field JavaScript option. Build an array of valid prefixes, then use join on the array to make your regex pattern.
// Every prefix that should be accepted; joined with "|" into one anchored alternation.
// The entries are inserted into the pattern verbatim, so regex metacharacters in a
// prefix would be interpreted as such.
var validPrefixes = ["abcd",
"abc",
"ab",
"a",
"areallylongprefix"];
var regexp = new RegExp("^(" + validPrefixes.join("|") + ")#$");
console.log(regexp.test("abcd#"));// true
console.log(regexp.test("ab#")); // true
console.log(regexp.test("abc#")); // true
console.log(regexp.test("abz#")); // false
console.log(regexp.test("abc#")); // true - same input as two lines up; presumably "abc" (no '#') was meant, which gives false
console.log(regexp.test("areallylongprefix#")); //true
This can be adapted to the language of your choosing, and is also handy if your prefixes are dynamically retrieved from a database or similar.
Here's my c# attempt:
// Returns true when v is a non-empty prefix of "abcd" followed by a single '#'.
// Only the final terminator is removed from v; the earlier version stripped every
// '#', so inputs such as "a#b#" or a bare "#" were wrongly accepted.
private static bool test(string v)
{
    var pattern = "abcd#";
    var terminator = pattern[pattern.Length - 1];
    // Needs at least one prefix character plus the terminator.
    if (string.IsNullOrEmpty(v) || v.Length < 2 || v[v.Length - 1] != terminator)
        return false;
    var word = pattern.Substring(0, pattern.Length - 1);
    var candidate = v.Substring(0, v.Length - 1);
    return word.StartsWith(candidate, System.StringComparison.Ordinal);
}
Console.WriteLine(test("abcd#")); // true
Console.WriteLine(test("ab#")); // true
Console.WriteLine(test("abc#")); // true
Console.WriteLine(test("abz#")); // false
Console.WriteLine(test("abc#")); // true - same input as two lines up
Console.WriteLine(test("abc")); //false
/a(b(cd?)?)?#/
Or for a longer example, to match a prefix of "abcdefg#":
/a(b(c(d(e(fg?)?)?)?)?)?#/
Generating this regex isn't completely trivial, but some options are:
// Builds a regex matching any non-empty prefix of s (s must end in '#') followed by '#'.
// Every letter after the first opens a group; all groups are closed as optional at the
// end. The last letter gets a group of its own, which is redundant but harmless.
function createPrefixRegex(s) {
    const letters = s.slice(0, -1).split('');
    const opening = letters.join('(');
    const closing = ')?'.repeat(s.length - 2);
    return new RegExp(`${opening}${closing}#`);
}
// Builds a regex matching any non-empty prefix of s (s must end in '#') followed by '#',
// without the redundant group around the last letter, e.g. "abcd#" -> /a(b(cd?)?)?#/.
// A one-letter prefix such as "a#" is handled explicitly; previously it ended up in
// ')?'.repeat(-1) and threw. Input without any prefix still raises a RangeError.
function createPrefixRegex2(s) {
    var last = s.length - 2; // index of the final prefix letter
    if (last < 0) {
        throw new RangeError('createPrefixRegex2 needs at least one character before "#"');
    }
    if (last === 0) {
        return new RegExp(s[0] + '#');
    }
    var r = s[0];
    for (var i = 1; i < last; ++i) {
        r += '(' + s[i];
    }
    r += s[last] + '?' + ')?'.repeat(last - 1) + '#';
    return new RegExp(r);
}
// Builds the same regex as the nested-group form /a(b(cd?)?)?#/, assembling the optional
// tail from the innermost letter outwards instead of recursing.
function createPrefixRegex3(s) {
    var last = s.length - 2; // index of the final prefix letter
    var tail = '';
    if (last >= 1) {
        // Innermost piece: the last letter, optional on its own.
        tail = s[last] + '?';
        // Wrap each earlier letter (down to index 1) around what has been built so far.
        for (var i = last - 1; i >= 1; --i) {
            tail = '(' + s[i] + tail + ')?';
        }
    }
    return new RegExp(s[0] + tail + '#');
}
These may fail if the input string has no prefix before the '#' character, and they assume the last character in the string is '#'.

What is wrong with this StringReplace code?

I have this string XXX:ABC. I want to remove XXX: so that the string becomes ABC .
The variable Symbol contains the string XXX:ABC .
The code as follows:
MsgBox, Symbol %Symbol%
SearchText := "XXX:"
ReplaceText := ""
StringReplace, newSymbol, Symbol, SearchText, ReplaceText, ALL
MsgBox, newSymbol %newSymbol%
From the message box output, newSymbol content is the same as Symbol. Can someone tell me what is wrong with my code?
I am using Autohotkey v1.1.14.03.
For command parameters, you have to distinguish between variable parameters and value parameters.
StringReplace for instance has the following argument list:
StringReplace, OutputVar, InputVar, SearchText [, ReplaceText,
ReplaceAll?]
The docs say furthermore:
OutputVar: The name of the variable in which to store the result
of the replacement process.
InputVar: The name of the variable whose contents will be read
from.
SearchText: The string to search for.
As you can see, some parameters are expected to be variable names, whereas others are expected to be values like strings or numbers. You can use variable contents as value parameters by either enclosing them in percent signs or using them within an expression:
StringReplace, newSymbol, Symbol, %SearchText%, %ReplaceText%, ALL
; or as an expression
StringReplace, newSymbol, Symbol, % SearchText, % ReplaceText, ALL
With the newer StrReplace() function I noticed I could not use it with variables for whatever reason. And the documentation here: https://autohotkey.com/docs/commands/StringReplace.htm
is lacking an example. After a lot of tests, couldn't figure it out.
So I wrote a "polyfill" for StrReplace, complete with test code.
; Author: John Mark Isaac Madison
; EMAIL : J4M4I5M7#hotmail.com
; I_SRC : input source text
; I_OLD : old token to find
; I_NEW : new token to replace old with
; Replaces every occurrence of I_OLD in I_SRC with I_NEW and returns the result.
;   I_SRC : input source text
;   I_OLD : old token to find
;   I_NEW : new token to replace old with (may be empty, which deletes I_OLD)
; Relies on FN_IS_TOKEN_HERE(I_SRC, I_OLD, dx), defined elsewhere. From its use here
; it is expected to return a positive index (the position of the token's last
; character) when I_OLD starts at position dx, and a negative value otherwise.
FN_POLYFILL_STR_REPLACE(I_SRC, I_OLD, I_NEW)
{
    ;Check input parameters:
    ;--------------------------------------------;
    if( !(StrLen(I_SRC) > 0) )
    {
        msgbox BAD_PARAM_#1:STR_REP
    }
    if( !(StrLen(I_OLD) > 0) )
    {
        msgbox BAD_PARAM_#2:STR_REP
    }
    ;An empty I_NEW is a valid request (remove the token), so it is not flagged.
    ;--------------------------------------------;
    OP := "" ;output string
    max_i := StrLen(I_SRC)
    dx := 0 ;<--Loop counter / index
    LOOP ;;[LOOP_START];;
    {
        dx++
        if(dx > max_i)
        {
            break ;[BAIL_OUT_OF_LOOP]
        }
        h := FN_IS_TOKEN_HERE(I_SRC, I_OLD, dx)
        if( h > 0)
        {
            ;token found, replace it by concating
            ;the I_NEW onto output string:
            OP := OP . I_NEW
            ;jump pointer to last character of
            ;the found token to skip over
            ;now irrelevant characters:
            dx := h
        }
        else if( 0 == h)
        {
            msgbox, "H_SHOULD_NOT_BE_ZERO"
        }
        else if( h < 0 )
        {
            ;concat character to output:
            c := SubStr(I_SRC,dx,1)
            OP := OP . c
        }
    } ;;[LOOP_END];;
    ;(the debugging msgbox that showed OP on every call has been removed)
    return OP ;;<--return output string
}
;Author: John Mark Isaac Madison
;EMAIL : J4M4I5M7#hotmail.com
;unit-test that will run when script boots up:
; Self-test for FN_POLYFILL_STR_REPLACE: runs four replacements and shows a
; message box for each result that differs from the expected string.
FN_POLYFILL_STR_REPLACE_TEST()
{
; T1: literal arguments, token at the end of the source.
T1 := FN_POLYFILL_STR_REPLACE("WHAT_IS_UP","UP","DOWN")
;;msgbox, T1 : %T1%
; T2: arguments passed as variables, token in the middle.
i_src := "123_TOKEN_123"
i_old := "TOKEN"
i_new := "NEEEW"
T2 := FN_POLYFILL_STR_REPLACE(i_src,i_old,i_new)
;;msgbox, T2 : %T2%
;;msgbox, "POLYFILL_TEST_RAN"
; T3: source containing a space, replacement shorter than the token.
i_src := "const IS_VARNAME"
i_old := "VARNAME"
i_new := "BASH"
T3 := FN_POLYFILL_STR_REPLACE(i_src,i_old,i_new)
;msgbox, T3 : %T3%
; T4: token directly after digits, replacement much shorter than the token.
i_src := "123456VARNAME"
i_old := "VARNAME"
i_new := "AB"
T4 := FN_POLYFILL_STR_REPLACE(i_src,i_old,i_new)
; Report every mismatch.
if(T1 != "WHAT_IS_DOWN")
{
msgbox [PSR_TEST_FAIL#1]
}
if(T2 != "123_NEEEW_123")
{
msgbox [PSR_TEST_FAIL#2]
}
if(T3 != "const IS_BASH")
{
msgbox [PSR_TEST_FAIL#3]
}
if(T4 != "123456AB")
{
msgbox [PSR_TEST_FAIL#4]
}
return ;rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr;
}
FN_POLYFILL_STR_REPLACE_TEST()
Also, be aware that trimming out newlines in your data is harder than it
should be as well. And will be bound to throw a monkey wrench into whatever string parsing you are doing.
Problem still exists in newer versions.
It can be solved with this alternative syntax for expressions:
newSymbol := StrReplace(Symbol, "" . SearchText, "" . ReplaceText)

token meaning dependent on context

I have a weird string syntax where the meaning of a
delimiter depends on context. In the following sample
input:
( (foo) (bar) )
the result is a list of two strings ["foo"; "bar"].
The outer pair of parenthesis enters list mode.
Then, the next pair of parentheses delimits the string.
Inside strings, balanced pairs of parentheses are to be
treated as part of the string.
Right now the lexer decides what to return depending
on a global variable inside.
{
open Sample_parser
exception Error of string
let inside = ref false (* <= to be eliminated *)
}
The delimiters are parentheses. If the lexer hits an
opening parenthesis, then
if inside is false, it emits an
Enter token and inside is set to true.
If inside is true, it switches to a string lexer
which treats any properly nested pair of parentheses
as part of the string. If the nesting level returns to
zero, the string buffer is passed to the parser.
If a closing parenthesis is encountered outside a string,
a Leave token is emitted and inside is unset.
My question is: How do I rewrite the lexer without
the global variable inside?
Fwiw I use menhir but afaict the same would be true for
ocamlyacc.
(Sorry if this sounds confused, I’m really a newbie to
the yacc/lex approach.
I can express all the above without thinking as a PEG but I
haven’t got used to mentally keeping lexer and parser
separated.
Feel free to point out other issues with the code!)
Simple example: *sample_lexer.mll*
(* Lexer for the "( (foo) (bar) )" syntax. The outer pair of parentheses yields
   Enter / Leave tokens; every parenthesised group inside it yields one String
   token whose contents may themselves contain balanced parentheses. *)
{
open Sample_parser
exception Error of string
(* true while the lexer is between the outer "(" and its matching ")" *)
let inside = ref false (* <= to be eliminated *)
}
let lpar = "("
let rpar = ")"
let ws = [' ' '\t' '\n' '\r']
(* Main entry point: skips whitespace. The first "(" becomes Enter; any further "("
   starts a string that is read by string_scanner; ")" becomes Leave. *)
rule tokenize = parse
| ws { tokenize lexbuf }
| lpar { if not !inside then begin
inside := true;
Enter
end else begin
let buf = Buffer.create 20 in
String (string_scanner
(Lexing.lexeme_start lexbuf)
0
buf
lexbuf)
end }
| rpar { inside := false; Leave }
(* Reads the body of one string.
   init  - offset of the opening parenthesis, used only in the end-of-file error message
   depth - current nesting level of parentheses inside the string
   buf   - accumulates the characters of the string
   A ")" at depth 0 ends the string and returns the buffer contents. *)
and string_scanner init depth buf = parse
| rpar { if depth = 0 then begin
Buffer.contents buf;
end else begin
Buffer.add_char buf ')';
string_scanner init (depth - 1) buf lexbuf end }
| lpar { Buffer.add_char buf '(';
string_scanner init (depth + 1) buf lexbuf }
| eof { raise (Error (Printf.sprintf
"Unexpected end of file inside string, pos %d--%d]!\n"
init
(Lexing.lexeme_start lexbuf))) }
| _ as chr { Buffer.add_char buf chr;
string_scanner init depth buf lexbuf }
*sample_parser.mly*:
/* Tokens produced by the lexer: String carries the text of one element,
   Enter / Leave mark the outer pair of parentheses. */
%token <string> String
%token Enter
%token Leave
/* Entry point: returns the list of strings found between Enter and Leave. */
%start <string list> process
%%
process:
| Enter lst = string_list Leave { lst }
/* Non-empty, right-recursive list of elements. */
string_list:
| elm = element lst = string_list { elm :: lst }
| elm = element { [elm] }
element:
| str = String { str }
main.ml:
(* Driver: parses sample_input and prints the resulting string list. *)
open Batteries
let sample_input = "( (foo (bar) baz) (xyzzy) )"
(* EibssssssssssssseibssssseiL
* where E := enter inner
* L := leave inner
* i := ignore (whitespace)
* b := begin string
* e := end string
* s := part of string
*
* desired result: [ "foo (bar) baz"; "xyzzy" ] (type string list)
*)
(* Runs the parser over sample_input. Lexer and parser errors are reported on
   stderr; the parser error message includes the current lexeme start offset. *)
let main () =
let buf = Lexing.from_string sample_input in
try
List.print
String.print stdout
(Sample_parser.process Sample_lexer.tokenize buf);
print_string "\n";
with
| Sample_lexer.Error msg -> Printf.eprintf "%s%!" msg
| Sample_parser.Error -> Printf.eprintf
"Invalid syntax at pos %d.\n%!"
(Lexing.lexeme_start buf)
let _ = main ()
You can pass the state as an argument to tokenize. It still has to be mutable, but not global.
(* Same rule as before, but the "inside the outer parentheses" flag is now the
   bool ref parameter `inside` instead of a global. It is passed along on the
   recursive call so that every token of one parse shares the same flag. *)
rule tokenize inside = parse
| ws { tokenize inside lexbuf }
| lpar { if not !inside then begin
inside := true;
Enter
end else begin
let buf = Buffer.create 20 in
String (string_scanner
(Lexing.lexeme_start lexbuf)
0
buf
lexbuf)
end }
| rpar { inside := false; Leave }
And you call the parser as follows:
Sample_parser.process (Sample_lexer.tokenize (ref false)) buf