Regex which matches anchor tags wrapping an url - regex

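I have the following text:
<a href="https://google.com">https://google.com</a>
<a href="https://google.com">website</a>
<a href="https://google.com"><em>https://google.com</em></a>
which I want to transform into:
https://google.com
<a href="https://google.com">website</a>
<em>https://google.com</em>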
by replacing anchor tags which contain urls with just the url.
I came this far: <a.*?href="http.*?>(.*?)<\/a> but I am struggling to make the group stricter. It should check for the http string and allow wrapping tags such as <em>.
Any help is appreciated, thanks!

I came up with:
// Demo: unwrap <a href="..."> tags whose inner content mentions "http",
// leaving every other anchor (and all non-anchor text) untouched.

// Sample input, one case per line; the last three pieces form a multi-line anchor.
var s = [
    'https://google.com\n',
    ' website \n',
    'website\n',
    '<em>https://google.com</em>\n',
    ' <em>https://google.com</em> \n',
    '<a href="https://www.google.com">\n',
    ' <em>https://www.google.com</em>\n',
    '</a>\n'
].join('');

// Group 1 captures the href value, group 2 the anchor's inner content with
// surrounding whitespace excluded.
// Flags: i = case-insensitive, s = "." also matches newlines, g = every anchor.
var re = /<a\s+href="([^"]+)"\s*>\s*(.+?)\s*<\/a>/isg;

var new_s = s.replace(re, function(whole, href, inner) {
    // Returning the full match means "no substitution" for this anchor.
    return inner.indexOf('http') !== -1 ? inner : whole;
});

console.log(new_s);
See demo

You can try using DOMParser
// Parse the markup with the browser's DOMParser and, for every <a> element,
// produce either its inner HTML (when that HTML contains the anchor's href)
// or the element itself (when it does not).
let str = `https://google.com
website
<em>https://google.com</em>`
let html = new DOMParser()
let parsed = html.parseFromString(str, 'text/html')
let final = Array.from(parsed.getElementsByTagName('a'), tag => {
    // Compare against the href with a single trailing "/" removed.
    const hrefWithoutTrailingSlash = tag.href.replace(/\/$/, '')
    return tag.innerHTML.includes(hrefWithoutTrailingSlash) ? tag.innerHTML : tag
})
console.log(final)

Related

Flutter Dart: RegEx to extract URLs from a String

This is my string:
window.urlVideo = 'https://node34.vidstreamcdn.com/hls/5d59908aea5aa101a054dec2a1cd3aff/5d59908aea5aa101a054dec2a1cd3aff.playlist.m3u8';
var playerInstance = jwplayer("myVideo");
var countplayer = 1;
var countcheck = 0;
playerInstance.setup({
sources: [{
"file": urlVideo
}],
tracks: [{
file: "https://cache.cdnfile.info/images/13f9ddcaf2d83d846056ec44b0f1366d/12.vtt",
kind: "thumbnails"
}],
image: "https://cache.cdnfile.info/images/13f9ddcaf2d83d846056ec44b0f1366d/12_cover.jpg",
});
function changeLink() {
window.location = "//vidstreaming.io/load.php?id=MTM0OTgz&title=Mairimashita%21+Iruma-kun+Episode+12";
}
window.shouldChangeLink = function () {
window.location = "//vidstreaming.io/load.php?id=MTM0OTgz&title=Mairimashita%21+Iruma-kun+Episode+12";
}
I am using flutter dart.
How can I get window.urlVideo URL link and image URL link and .vtt file link?
Or
How can I get a list of URLs from a String?
I tried finding a way with and without using RegEx but I couldn't.
Any help is appreciated.
This may not be the complete regex, but this worked for me for randomly picked links:
/// Prints every URL-like substring found in a sample paragraph, one per line.
void main() {
  final text = """My website url: https://blasanka.github.io/
Google search using: www.google.com, social media is facebook.com, http://example.com/method?param=flutter
stackoverflow.com is my greatest website. DartPad share: https://github.com/dart-lang/dart-pad/wiki/Sharing-Guide see this example and edit it here https://dartpad.dev/3d547fa15849f9794b7dbb8627499b00""";

  // Optional http/https/ftp scheme, then two runs of URL characters joined by a dot.
  final exp = RegExp(r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+');

  for (final match in exp.allMatches(text)) {
    // group(0) is the whole matched text, i.e. text[match.start .. match.end).
    print(match.group(0));
  }
}
Result:
https://blasanka.github.io/
www.google.com
facebook.com
http://example.com/method?param=flutter
stackoverflow.com
https://github.com/dart-lang/dart-pad/wiki/Sharing-Guide
https://dartpad.dev/3d547fa15849f9794b7dbb8627499b00
Play with it here: https://dartpad.dev/3d547fa15849f9794b7dbb8627499b00
Try this,
// Collect every substring of `text` matched by the URL pattern, then print
// them one per line. `text` is expected to be defined by the surrounding code.
final urlRegExp = RegExp(
    r"((https?:www\.)|(https?:\/\/)|(www\.))[-a-zA-Z0-9#:%._\+~#=]{1,256}\.[a-zA-Z0-9]{1,6}(\/[-a-zA-Z0-9()#:%_\+.~#?&\/=]*)?");
final urlMatches = urlRegExp.allMatches(text);
List<String> urls = <String>[];
for (final urlMatch in urlMatches) {
  urls.add(text.substring(urlMatch.start, urlMatch.end));
}
urls.forEach(print);
To get just the http(s) and ftp URLs that are enclosed in quotes, use this:
r"([\"'])\s*((?:(?:https?|ftp):\/\/)(?:\S+(?::\S*)?#)?(?:(?:(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-zA-Z0-9\u00a1-\uffff]+-?)*[a-zA-Z0-9\u00a1-\uffff]+)(?:\.(?:[a-zA-Z0-9\u00a1-\uffff]+-?)*[a-zA-Z0-9\u00a1-\uffff]+)*(?:\.(?:[a-zA-Z\u00a1-\uffff]{2,})))|localhost)(?::\d{2,5})?(?:\/(?:(?!\1|\s)[\S\s])*)?)\s*\1"
Where the Url is captured in group 2.
https://regex101.com/r/UPmLBl/1
Much safer to use a library like linkify instead of rolling your own regex.
/// Returns the URL of the first link found in [input].
///
/// The input is run through `linkify` (with `humanize` disabled) and the
/// `url` of the first resulting `LinkableElement` is returned.
/// Returns null when no such element is produced.
String extractLink(String input) {
  final links = linkify(
    input,
    options: LinkifyOptions(humanize: false),
  ).whereType<LinkableElement>();

  return links.isEmpty ? null : links.first.url;
}

How to get the KeyValue from QueryString and return string in specific string format

I am a newbie and need help.
I need to create a baseString from the below QueryString.
This baseString will look something like this in the end:
&ap=裕廊坊 心邻坊&oq=c# nunit mac&q=c# nunit mac
QueryString :
HTTPS://www.sky.com/api/v1/rest/level2/in-in/?q=c%23+nunit+mac&oq=c%23+nunit+mac&ap=裕坊%20邻坊
Problem:
How to get the KeyValue from the above QueryString
1) By getting all the components separated by "&" like below
--Keyvalue from the Query String:
q=c%23+nunit+mac&oq
oq=c%23+nunit+mac
ap=裕坊%20邻坊
I would use struct as I need to call the static func:
struct BaseString {
static func createBaseString(authPrefix,signMethod,urlPath,nonce, timestamp,delimeter="&", bool sort= true, bool quote = false) -> String? {
var dict = [String:String]()
let url = NSURL(string: urlPath)
var keyValues = url.query?.componentsSeparatedByString("&")
//-(1)- adding keyValue into Dictinary
dict.??
//-- how to add the data below?
//- after (1) : Add other key value into same Dictionary
dict Add(authPrefix + "_timestamp", timestamp);
dict.Add(authPrefix + "_nonce", nonce);
dict.Add(authPrefix + "_signature_method", signMethod);
dict.Add(authPrefix + "_version", "1.0");
var return_format:String
if quote == true{
//-- create a baseString
sort the Dictionary
return_format = "&" + url + "&" +Dictionary.ToString()
(format: String = "q ="V1" " the value with double quote)
}else{
//-- create a baseString
sort the Dictionary
return_format = Dictionary.ToString()
(format:Strig = " q=v2")
}
var baseString = return_format
return baseString
}
}
Thanks. Your help is much appreciated.
You can get Key-Value dictionary from your URL's query items with the help of URLQueryItem class, like this
// Build a [name: value] dictionary from the query items of a URL string.
let urlString = "https://www.sky.com/api/v1/rest/level2/in-in/?q=c%23+nunit+mac&oq=c%23+nunit+mac&ap=裕坊%20邻坊"

// Percent-encode the raw string first so that URLComponents can parse the
// non-ASCII characters. As the sample output below shows, the values that come
// back still carry their original escapes ("%23", "%20").
let encodedUrlString = urlString.addingPercentEncoding(withAllowedCharacters: .urlQueryAllowed)!
let items = URLComponents(string: encodedUrlString)?.queryItems

var keyValues: [String: String] = [:]
// No query items (or an unparsable URL) simply leaves the dictionary empty.
for item in items ?? [] {
    keyValues[item.name] = item.value
}
print(keyValues)
//["q": "c%23+nunit+mac", "ap": "裕坊%20邻坊", "oq": "c%23+nunit+mac"]
Hope this will help you.
If you need to obtain Query substring from your url string, you need to create URL from it and get query.
// Extract the query substring of `encodedUrlString` and print it with one
// level of percent-encoding removed.
//
// Both the URL and its query are unwrapped in a single guard, so a URL without
// a query fails with a message rather than crashing on a force-unwrap.
guard let url = URL(string: encodedUrlString), let queryString = url.query else {
    fatalError("encodedUrlString is not a valid URL or has no query component")
}
// removingPercentEncoding returns an Optional; fall back to the raw query so
// the printed text is the plain string rather than `Optional("...")`.
print(queryString.removingPercentEncoding ?? queryString)
//q=c%23+nunit+mac&oq=c%23+nunit+mac&ap=裕坊%20邻坊
If you need to add new components to your query,
// Append one extra query item to the URL and read the result back as a string.
let item = URLQueryItem(name: "NEWVKEY", value: "NEWVALUE")
var components = URLComponents(string: encodedUrlString)
// Optional chaining: nothing is appended when the URL could not be parsed or
// when it has no query items to begin with.
components?.queryItems?.append(item)

let url = components?.url
// Percent-encoded form of the new URL.
let resultString = url?.absoluteString
//or
// The same string with percent-encoding removed.
let resultString2 = resultString?.removingPercentEncoding
The idea is to use URL processing abilities of Swift standard library. Please check the documentation of URL, URLComponents, URLQueryItem structs. Don't write string processing code, manipulate URLs instead.
https://developer.apple.com/documentation/foundation/url
https://developer.apple.com/documentation/foundation/urlcomponents
https://developer.apple.com/documentation/foundation/urlqueryitem

Reg-ex query is too greedy

Consider the following snippet:
<Offering id=1 blah blah templateid=abc something=blah
gretre
rtert
ret
tr
/Offering>
<Offering id=2 blah blah templateid=def something=blah>
gretre
rtert
ret
tr
</Offering>
<Offering id=3 blah blah templateid=ghi something=blah>
gretre
rtert
ret
tr
</Offering>
Given that all I know is the template id, I need to return the whole Offering node that contains it. i.e. for templateid=def, I need to return:
<Offering id=2 blah blah templateid=def something=blah>
gretre
rtert
ret
tr
</Offering>
I've tried all sorts but the closest I can get is something along the lines of (?s)<Offering.+?templateid=def.+?</Offering> which returns from the first offering until the end of the offering containing my template id. I understand why but nothing I've tried can fix it. I'm guessing lookarounds but I just can't get it right.
How can I return the whole offering node?
You could modify your regex using negation and I would probably use a word boundary as well.
<Offering[^>]*\btemplateid=def[^>]*>[^<]*</Offering>
If you have other tags inside of this tag, you could do ...
(?s)<Offering[^>]*\btemplateid=def.+?</Offering>
This should work but please notice that I escaped the / character, and you may not need to do that depending on what language you're using:
(<Offering[^>]* templateid=ghi [^>]*>[^<]*<\/Offering>)
As you say you "need to return the whole Offering node", the arguably simpler, safer and more readable way would be a DOM parser. I've included examples of how you might do this in JavaScript and PHP below.
PHP
// Find the <offering> element whose templateid attribute equals $templateID.
// Expects $testStr to hold the markup to search. $myNode ends up as the last
// matching element, or null when there is none.
$doc = new DOMDocument();

// The document must actually be loaded, otherwise there is no <body> to walk.
// The sample markup is not valid HTML (custom tags, valueless attributes), so
// libxml warnings are collected internally rather than emitted, and the
// previous error mode is restored afterwards.
$previousErrorMode = libxml_use_internal_errors(true);
$doc->loadHTML($testStr);
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);

$body = $doc->getElementsByTagName('body')->item(0);
$templateID = 'def';
$myNode = null;

// $body is null when the parsed document has no <body> element.
if ($body !== null) {
    foreach ($body->childNodes as $node) {
        // Child nodes also include text nodes, which have no attributes;
        // only elements can carry templateid.
        if (!($node instanceof DOMElement) || $node->nodeName != 'offering') {
            continue;
        }
        // hasAttribute() guards against elements without a templateid at all.
        if ($node->hasAttribute('templateid') && $node->getAttribute('templateid') == $templateID) {
            $myNode = $node;
        }
    }
}
//$id = $myNode->attributes->getNamedItem('id')->nodeValue;
//$html = $doc->saveHTML($myNode)
JavaScript
// Parse the markup held in #str_container and look among the direct children
// of the parsed <body> for the element whose templateid equals templateID.
var testStr = document.getElementById('str_container').innerHTML;
var parser = new DOMParser();
var doc = parser.parseFromString(testStr, 'text/html');
var templateID = 'def';
var myEl = null;
Array.prototype.forEach.call(doc.body.children, function (child) {
    // No early exit: when several children match, the last one wins.
    if (child.getAttribute('templateid') === templateID) {
        myEl = child;
    }
});
//var id = myEl.id;
//var html = myEl.outerHTML;
console.log(myEl || 'not found');
JavaScript >= IE8
// Parse the markup held in #str_container and select the first <offering>
// element whose templateid attribute equals templateID.
var testStr = document.getElementById('str_container').innerHTML;
var parser = new DOMParser();
var doc = parser.parseFromString(testStr, 'text/html');
var templateID = 'def';
// Quote the attribute value: an unquoted value must be a valid CSS identifier,
// so IDs such as "123" or "a b" would otherwise make querySelector throw.
// Backslashes and double quotes are escaped so the value cannot break out of
// the quoted string.
var quotedTemplateID = '"' + String(templateID).replace(/["\\]/g, '\\$&') + '"';
var myEl = doc.body.querySelector('offering[templateid=' + quotedTemplateID + ']');
//var id = myEl.id;
//var html = myEl.outerHTML;
console.log(myEl || 'not found');

Replacing multiple URLs in a string by a links with regex

I'm using a filter with AngularJS to replace URLs in strings by window.open() function and email by mailto :
app.filter('parseUrl', function() {
//URLs starting with http://, https://, or ftp://
var replacePattern1 = /(\b(https?|ftp):\/\/[-A-Z0-9+&##\/%?=~_|!:,.;]*[-A-Z0-9+&##\/%=~_|])/gim;
//URLs starting with "www." (without // before it, or it'd re-link the ones done above).
var replacePattern2 = /(^|[^\/])(www\.[\S]+(\b|$))/gim;
//Change email addresses to mailto:: links.
var replacePattern3 = /(\w+#[a-zA-Z_]+?\.[a-zA-Z]{2,6})/gim;
return function(text, target, otherProp) {
var originText = text;
if(text == undefined || text == "") {
return "";
} else {
angular.forEach(text.match(replacePattern1), function(url) {
text = text.replace(replacePattern1, "<span onclick=\"window.open('$1', '_system');\" class='link_url'>$1</span>");
});
angular.forEach(text.match(replacePattern2), function(url) {
text = text.replace(replacePattern2, "<span onclick=\"window.open('http://$2', '_system');\" class='link_url'>$1$2</span>");
});
angular.forEach(text.match(replacePattern3), function(url) {
text = text.replace(replacePattern3, "$1");
});
return text;
}
};
});
It works well with one URL or one email, but when I have two or more URLs it stops working, because it replaces the URLs multiple times with a span containing the window.open() call, as you can see in this JSFiddle: http://jsfiddle.net/wps94/1/
Do you have any idea how to avoid this? Maybe by changing the regex?
Thank you
Try using a single regex and a replacement function:
var replacePattern = /\b((http:\/\/|https:\/\/|ftp:\/\/|mailto:|news:)|www\.|ftp\.|[^ \,\;\:\!\)\(\""\'\<\>\f\n\r\t\v]+#)([^ \,\;\:\!\)\(\""\'\<\>\f\n\r\t\v]+)\b/gim;
return function(text, target, otherProp) {
var originText = text;
if(text == undefined || text == "") {
return "";
} else {
return text.replace(replacePattern, function($0, $1) {
var match = $0;
var protocol = $1;
if ((/^www\./i).test(match))
{
return "<span onclick=\"window.open('http://" + match + "', '_system');\" class='link_url'>" + match + "</span>";
}
if ((/^ftp\./i).test(match))
{
return "<span onclick=\"window.open('ftp://" + match + "', '_system');\" class='link_url'>" + match + "</span>";
}
if (protocol && protocol.charAt(0) === '#')
{
return "" + match + "";
}
return "<span onclick=\"window.open('" + match + "', '_system');\" class='link_url'>" + match + "</span>";
});
}
};
http://jsfiddle.net/wps94/2/
In the forEachs, change text = to newText = and then return that instead…like this:
// AngularJS filter that turns URLs in a string into clickable spans (opened
// through window.open) and email addresses into mailto: links.
//
// Returns "" for undefined, null or empty input; otherwise returns the text
// with all three replacements applied in order.
//
// NOTE(review): the matched text is inserted into HTML and into an onclick
// handler without escaping. Only use this on trusted or already-sanitised text.
app.filter('parseUrl', function() {
    // URLs starting with http://, https://, or ftp://
    var replacePattern1 = /(\b(https?|ftp):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/gim;
    // URLs starting with "www." (without // before it, or it'd re-link the ones done above).
    var replacePattern2 = /(^|[^\/])(www\.[\S]+(\b|$))/gim;
    // Email addresses, to be changed into mailto: links.
    var replacePattern3 = /(\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,6})/gim;

    return function(text, target, otherProp) {
        if (text == undefined || text == "") {
            return "";
        }

        // Every pattern carries the g flag, so a single replace() call already
        // rewrites all occurrences. Looping once per match would wrap each URL
        // again on every pass. Each step works on the previous step's result,
        // so no replacement is lost, and unmatched text is returned unchanged.
        var newText = text;
        newText = newText.replace(replacePattern1, "<span onclick=\"window.open('$1', '_system');\" class='link_url'>$1</span>");
        newText = newText.replace(replacePattern2, "$1<span onclick=\"window.open('http://$2', '_system');\" class='link_url'>$2</span>");
        newText = newText.replace(replacePattern3, "<a href=\"mailto:$1\">$1</a>");
        return newText;
    };
});
Each pass through the forEach was looking at the value of text with the replacement already made for the first URL rather than the initial value.

XPath regex combined with preg_match

I have a simple but invisible (for me) error in code. Can you help me?
With this code in my php file:
$location = $xpath2->query("//script")->item(1)->textContent;
I got (select) this:
<script class="" type="text/javascript">
//<![CDATA[
var html = '';
var lat = 44.793530904744074;
var lang = 20.5364727973938;
if (GBrowserIsCompatible())
{
var map = new GMap2(document.getElementById("map_canvas"));
var ct = new GLatLng(lat, lang);
map.setCenter(ct, 15);
map.addControl( new GSmallMapControl() );
//map.addControl( new GHierarchicalMapTypeControl () );
var gm=new GMarker(ct);
if(html != '') {
GEvent.addListener(gm, "click", function() {
this.openInfoWindowHtml( html );
});
}
map.addOverlay(gm);
map.enableContinuousZoom();
map.enableInfoWindow();
}
//]]>
</script>
Then I try to fetch 'lat' and 'lang' with this code:
$location = $xpath2->query("//script")->item(1)->textContent;
preg_match('/var\s+lat\s+=\s+(\d+\.\d+)\s*;/', $location, $lat);
preg_match('/var\s+lang\s+=\s+(\d+\.\d+)\s*;/', $location, $lng);
$data['lat'] = $lat[1];
$data['lng'] = $lng[1];
But lat and lang always come out as 0, 0 when they should be 44.34534 and 20.5345.
Please help! Where do you think I went wrong? (My English is not very good, sorry for that.)
Maybe something like below. Beware though you're trying to parse JavaScript.
preg_match('/(?:^|(?<=\s))var\s+lat \s* = \s* (?=[^;]*\d) ([+-]?\d*\.?\d*)\s*; /x', $location, $lat);
preg_match('/(?:^|(?<=\s))var\s+lang\s* = \s* (?=[^;]*\d) ([+-]?\d*\.?\d*)\s*; /x', $location, $lng);
Run sample: http://www.ideone.com/SEgVb
Or, just try to get more general information:
preg_match('/(?:^|(?<=\s))var\s+lat \s*=\s* ([^;]*) \s*; /x', ...
Try like this:
preg_match('/var\s+lat\s+=\s+([\d.-]+)/', $location, $lat);
preg_match('/var\s+lang\s+=\s+([\d.-]+)/', $location, $lng);
The [\d.-]+ matches any group with numbers . or - (lat/lon can be negative)