Find and change cyrillic word with boundary in google scripts - regex

The problem is that \b doesn't work with Russian and Ukrainian letters.
Here I try to find all matches of a word 'февраля' it the text, change them to tempword, then make it a link and change it back to 'февраля'.
function addLinks(word, siteurl) {
var id = 'doc\'s ID';
var doc = DocumentApp.openById(id);
var body = doc.getBody();
var tempword = 'ASDFDSGDDKDSL2';
var searchText = "\\b"+word+"\\b";
var element = body.findText(searchText);
console.log(element);
while (element) {
var start = element.getStartOffset();
var text = element.getElement().asText();
text.replaceText(searchText, tempword);
text.setLinkUrl(start, start + tempword.length - 1, siteurl);
element = body.findText(searchText);
}
body.replaceText(tempword, word);
}
addLinks('февраля', 'example.com');
It works as it should, if I change Russian word 'февраля' to English 'february'.
addLinks('february', 'example.com');
I need regular expression, because if I just look for 'февраля' script will apply it to other words like 'февралям', 'февралями' etc.
So, it is a question, how to make it work.
Mistake "Exception: Invalid regular expression pattern" occurs with this code:
var searchText = "(?<=[\\s,.:;\"']|^)"+word+"(?=[\\s,.:;\"']|$)";
or this:
var searchText = "(^|\s)"+word+"(?=\s|$)";
and some other.

Here is my solution:
function main() {
addLinks('февраля', 'example.com');
}
function addLinks(word, url) {
var doc = DocumentApp.getActiveDocument();
var pgfs = doc.getParagraphs();
var bound = '[^А-яЁё]'; // any letter except Russian one
var patterns = [
{regex: bound + word + bound, start: 1, end: 1}, // word inside of line
{regex: '^' + word + bound, start: 0, end: 1}, // word at the start
{regex: bound + word + '$', start: 1, end: 0}, // word at the end
{regex: '^' + word + '$', start: 0, end: 0} // word = line
];
for (var pgf of pgfs) for (var pattern of patterns) {
var location = pgf.findText(pattern.regex);
while (location) {
var start = location.getStartOffset() + pattern.start;
var end = location.getEndOffsetInclusive() - pattern.end;
pgf.editAsText().setLinkUrl(start, end, url);
location = pgf.findText(pattern.regex, location);
}
}
}
Test output:
It handles well the word placed at the start or at the end of the line (or both). And it gives no the weird error message.

Related

How to get integer with regular expression in kotlin?

ViewModel
fun changeQty(textField: TextFieldValue) {
val temp1 = textField.text
Timber.d("textField: $temp1")
val temp2 = temp1.replace("[^\\d]".toRegex(), "")
Timber.d("temp2: $temp2")
_qty.value = textField.copy(temp2)
}
TextField
OutlinedTextField(
modifier = Modifier
.focusRequester(focusRequester = focusRequester)
.onFocusChanged {
if (it.isFocused) {
keyboardController?.show()
}
},
value = qty.copy(
text = qty.text.trim()
),
onValueChange = changeQty,
label = { Text(text = qtyHint) },
singleLine = true,
keyboardOptions = KeyboardOptions(
keyboardType = KeyboardType.Number,
imeAction = ImeAction.Done
),
keyboardActions = KeyboardActions(
onDone = {
save()
onDismiss()
}
)
)
Set KeyboardType.Number, it display 1,2,3,4,5,6,7,8,9 and , . - space.
I just want to get integer like -10 or 10 or 0.
But I type the , or . or -(not the front sign), it show as it is.
ex)
typing = -10---------
hope = -10
display = -10---------
I put regular expression in
val temp2 = temp1.replace("[^\\d]".toRegex(), "")
But, it doesn't seem to work.
How I can get only integer(also negative integer)?
Use this regex (?<=(\d|-))(\D+) to replace all non digit characters, except first -.
fun getIntegersFromString(input: String): String {
val pattern = Regex("(?<=(\\d|-))(\\D+)")
val formatted = pattern.replace(input, "")
return formatted
}
Check it here

Regular expression to extract Words inside nested parentheses

im looking for the regexp that make able to do this tasks
message Body Input: Test1 (Test2) (test3) (ti,ab(text(text here(possible text)text(possible text(more text))))) end (text)
the result that i want Result: (text(text here(possible text)text(possible text(more text))))
I want to collect everything that is inside ti,ab(................)
var messageBody = message.getPlainBody()
var ssFile = DriveApp.getFileById(id);
DriveApp.getFolderById(folder.getId()).addFile(ssFile);
var ss = SpreadsheetApp.open(ssFile);
var sheet = ss.getSheets()[0];
sheet.insertColumnAfter(sheet.getLastColumn());
SpreadsheetApp.flush();
var sheet = ss.getSheets()[0];
var range = sheet.getRange(1, 1, sheet.getLastRow(), sheet.getLastColumn() + 1)
var values = range.getValues();
values[0][sheet.getLastColumn()] = "Search Strategy";
for (var i = 1; i < values.length; i++) {
//here my Regexp
var y = messageBody.match(/\((ti,ab.*)\)/ig);
if (y);
values[i][values[i].length - 1] = y.toString();
range.setValues(values);
The only solution you may use here is to extract all substrings inside parentheses and then filter them to get all those that start with ti,ab:
var a = [], r = [], result;
var txt = "Test1 (Test2) (test3) (ti,ab(text(text here(possible text)text(possible text(more text))))) end (text)";
for(var i=0; i < txt.length; i++){
if(txt.charAt(i) == '(') {
a.push(i);
}
if(txt.charAt(i) == ')') {
r.push(txt.substring(a.pop()+1,i));
}
}
result = r.filter(function(x) { return /^ti,ab\(/.test(x); })
.map(function(y) {return y.substring(6,y.length-1);})
console.log(result);
The nested parentheses function is borrowed from Nested parentheses get string one by one. The /^ti,ab\(/ regex matches ti,ab( at the start of the string.
The above solution allows extracting nested parentheses inside nested parentheses. If you do not need it, use
var txt = "Test1 (Test2) ((ti,ab(text(text here))) AND ab(test3) Near Ti(test4) NOT ti,ab,su(test5) NOT su(Test6))";
var start=0, r = [], level=0;
for (var j = 0; j < txt.length; j++) {
if (txt.charAt(j) == '(') {
if (level === 0) start=j;
++level;
}
if (txt.charAt(j) == ')') {
if (level > 0) {
--level;
}
if (level === 0) {
r.push(txt.substring(start, j+1));
}
}
}
console.log("r: ", r);
var rx = "\\b(?:ti|ab|su)(?:,(ti|ab|su))*\\(";
var result = r.filter(function(y) { return new RegExp(rx, "i").test(y); })
.map(function(x) {
return x.replace(new RegExp(rx, "ig"), '(')
});
console.log("Result:",result);
The pattern used to filter and remove the unnecessary words
\b(?:ti|ab|su)(?:,(ti|ab|su))*\(
Details
\b - a word boundary
(?:ti|ab|su) - 1 of the alternatives,
(?:,(ti|ab|su))* - 0 or more repetitions of , followed with 1 of the 3 alternatives
\( - a (.
The match is replaced with ( to restore it in the match.

Mongodb Regex Query first 2 characters of the string

In one of my mongodb collection, I have a date string that has a mm/dd/yyyy format. Now, I want to query the 'mm' string.
Example, 05/20/2016 and 04/05/2015.
I want to get the first 2 characters of the string and query '05'. With that, the result I will get should only be 05/20/2016.
How can I achieve this?
Thanks!
For a regex solution, the following will suffice
var search = "05",
rgx = new RegExp("^"+search); // equivalent to var rgx = /^05/;
db.collection.find({ "leave_start": rgx });
Testing
var leave_start = "05/06/2016",
test = leave_start.match(/^05/);
console.log(test); // ["05", index: 0, input: "05/06/2016"]
console.log(test[0]); // "05"
or
var search = "05",
rgx = new RegExp("^"+search),
leave_start = "05/12/2016";
var test = leave_start.match(rgx);
console.log(test); // ["05", index: 0, input: "05/06/2016"]
console.log(test[0]); // "05"
Another alternative is to use the aggregation framework and take advantage of the $substr operator to extract the first 2 characters of a field and then the $match operator will filter documents based on the new substring field above:
db.collection.aggregate([
{
"$project": {
"leaves_start": 1,
"monthSubstring": { "$substr": : [ "$leaves_start", 0, 2 ] }
}
},
{ "$match": { "monthSubstring": "05" } }
])

Replacement matching regex with anchor tag?

I have a problem when using Regex. I have a html document which create an anchor link when it matches condition.
An example html:
Căn cứ Luật Tổ chức HĐND và UBND ngày 26/11/2003;
Căn cứ Nghị định số 63/2010/NĐ-CP ngày 08/6/2010 của Chính phủ về
kiểm soát thủ tục hành chính;
Căn cứ Quyết định số 165/2011/QĐ-UBND ngày 06/5/2011 của UBND tỉnh
ban hành Quy định kiểm soát thủ tục hành chính trên địa bàn tỉnh;
Căn cứ Quyết định số 278/2011/QĐ-UBND ngày 02/8/2011 của UBND tỉnh
ban hành Quy chế phối hợp thực hiện thống kê, công bố, công khai thủ
tục hành chính và tiếp nhận, xử lý phản ánh, kiến nghị của cá nhân, tổ
chức về quy định hành chính trên địa bàn tỉnh;
Xét đề nghị của Giám đốc Sở Công Thương tại Tờ trình số
304/TTr-SCT ngày 29 tháng 5 năm 2013
I want to match these bold texts and make anchor links from these. If it has, try ignore. Link example 63/2010/NĐ-CP
var matchLegals = new Regex(#"(?:[\d]+\/?)\d+\/[a-z\dA-Z_ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềềểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳỵỷỹ\-]+", RegexOptions.Compiled);
var doc = new HtmlDocument();
doc.LoadHtml(htmlString);
var allElements = doc.DocumentNode.SelectSingleNode("//div[#class='main-content']").Descendants();
foreach (var node in allElements)
{
var matches = matchLegals.Matches(node.InnerHtml);
foreach (Match m in matches)
{
var k = m.Value;
//dont know what to do
}
}
What can i do this
Many thanks.
I assume your regex pattern is OK and works. Another assumption is that node.InnerHtml doesn't contain any <a> tags already encompassing any of the potential matches.
In this case, it's as simple as doing something like this:
node.InnerHtml = Regex.Replace(node.InnerHtml, "[your pattern here]", "<a href='query=$&'>$&</a>");
...
doc.Save("output.html");
Note, that you may need to work on the href component - I'm unsure how your link should be built.
you match text and replace:
<script>
var s = '...';
var matchs = s.match(/\d{2,3}\/\d{4}\/[a-zA-Z\-áàảãạăâắằấầặẵẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịđùúủũụưứửữựÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼÊỀỂỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỨỪỬỮỰỲỴÝỶỸửữựỵỷỹ]+/gi);
if (matchs != null) {
for(var i=0; i<matchs.length;i++){
var val = matchs[i];
s = s.replace(val, '<a href="?key=' + val + '"/>' + val + '</a>');
}
}
document.write(s);
</script>
#Shaamaan thank for your advice. After few hours of coding, it works now
var content = doc.DocumentNode.SelectSingleNode("//div[#class='main-content']");
var items = content.SelectNodes(".//text()[normalize-space(.) != '']");
foreach (HtmlNode node in items)
{
if (!matchLegals.IsMatch(node.InnerText) || node.ParentNode.Name == "a")
{
continue;
}
var texts = node.InnerHtml.Trim();
node.InnerHtml = matchLegals.Replace(texts, a => string.Format("<a href='/search?q={0}'>{0}</a>",a.Value));
}

How to find which group is matched in NSRegularExpression

I have a regex statement with multiple capture groups which are separated by | operator. How can I find out which capture group is matched? Only way I can think of -for this example- is counting the number of characters if something is matched.
var string = "1234567897"
var pattern = "(^\\d{9}$)|(^\\d{10}$)|(^\\d{13}$)|(^[a-zA-Z]{2}\\d{9}[a-zA-Z]{2}$)"
var myRegex = NSRegularExpression(pattern: pattern, options: nil, error: nil)!
if let myMatch = myRegex.firstMatchInString(string, options: nil,
range: NSRange(location: 0, length: string.utf16Count)) {
println((string as NSString).substringWithRange(myMatch.rangeAtIndex(0)))
}
I wrote a code which worked for my example. I am sure it can be written better way but it works for now.
Swift 2.3
var string = "123456789"
var pattern = "(^\\d{9}$)|(^\\d{10}$)|(^\\d{13}$)|(^[a-zA-Z]{2}\\d{9}[wW]{2}$)"
var myRegex = try! NSRegularExpression(pattern: pattern, options: [])
if let myMatch = myRegex.firstMatchInString(string, options: NSMatchingOptions.init(rawValue: 0), range: NSRange(location: 0, length: string.utf16.count)) {
var matchedGroup = 0
for var i in 1..<myMatch.numberOfRanges {
if myMatch.rangeAtIndex(i).length != 0 {
matchedGroup = i
break
}
}
print(matchedGroup)
print((string as NSString).substringWithRange(myMatch.rangeAtIndex(0))) //whatever the range you want to print
}
Swift 3
var string = "123456789"
var pattern = "(^\\d{9}$)|(^\\d{10}$)|(^\\d{13}$)|(^[a-zA-Z]{2}\\d{9}[wW]{2}$)"
var myRegex = try! NSRegularExpression(pattern: pattern, options: [])
if let myMatch = myRegex.firstMatch(in: string, options: NSRegularExpression.MatchingOptions.init(rawValue: 0), range: NSRange(location: 0, length: string.utf16.count)) {
var matchedGroup = 0
for var i in 1..<myMatch.numberOfRanges {
if myMatch.rangeAt(i).length != 0 {
matchedGroup = i
break
}
}
print(matchedGroup)
print((string as NSString).substring(with: myMatch.rangeAt(0))) //whatever the range you want to print
}