How to autocorrect misplaced hyphen in a word? - regex

Situation & Problem
1 .
eg:
Say, you have a paragraph.
The word sentence is broken down to sente-nce with a hyphen.
Imagine you have this sample sentence, which is a very long sente-
nce that has a word being broken down with a hyphen.
2 .
How can I detect that word sente-nce is broken down with a hyphen, and correct it into sentence?
note:
Is there any library I can use to do that (prefer Java / Python / any software)?
Using a simple regex to match all (\w)-(\w) & replace with $1$2 won't work in all cases.
eg: Imagine you have a word event-driven, it will become eventdriven, which is undesired.

You need to check if the word belongs to english vocabulary. Find all the matches, for each check if word exists in english vocabulary and if not, then change the word. Something like:
# PyEnchant spell-checking bindings (third-party package, not in the standard library).
import enchant
# Dictionary object for the "en_US" language tag.
voc = enchant.Dict("en_US")
# Sample token whose hyphen was misplaced.
word = "sente-nce"
# The result is not stored here; per the surrounding text it is False when the token is not a word.
voc.check(word)
It returns False if it's not a word.

Solution (may not be the best)
logic & usage
/*
#logic::
regex match all words with hyphen -
loop check if those words are correct by using a dictionary
_ & fix if they have hyphen misplaced
#to_use::
put your dictionary in Path path = Paths.get("words_alpha.txt"); <= https://github.com/dwyl/english-words
put your sentence to autoCorrect on in content_TESTING
execute & get output
#note::
depending on the quality of the dictionary, the results may not be good.
#note::
if your words contains "space or newline \n" -> modify the regex in String str_RegexPattern = "([a-zA-Z]+)-([a-zA-Z]+)";
#note::
this is not fully tested yet
*/
code
package com.ex.main.autoCorrectHypen;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/*
#logic::
1. regex match all words with hyphen -
2. loop check if those words are correct by using a dictionary
_ & fix if they have hyphen misplaced
#to_use::
1. put your dictionary in `Path path = Paths.get("words_alpha.txt");` <= https://github.com/dwyl/english-words
2. put your sentence to autoCorrect on in `content_TESTING`
3. execute & get output
#note::
depending on the quality of the dictionary, the results may not be good.
#note::
if your words contains "space or newline \n" -> modify the regex in `String str_RegexPattern = "([a-zA-Z]+)-([a-zA-Z]+)";`
#note::
this is not fully tested yet
*/
// https://stackoverflow.com/questions/11607270/how-to-check-whether-given-string-is-a-word
// https://github.com/dwyl/english-words
// ~// https://github.com/first20hours/google-10000-english
/**
 * In-memory word list used to decide whether a token is a known word.
 *
 * <p>The list is loaded once, when the class is initialised, from {@code words_alpha.txt}
 * in the working directory. The file is expected to hold one word per line.
 */
class Dictionary {
    /** Every non-blank line of the word-list file, trimmed. */
    private static final Set<String> wordsSet = new HashSet<>();

    /**
     * Reads {@code words_alpha.txt} (decoded as UTF-8) and adds each non-blank line to the set.
     *
     * <p>Lines may end in {@code \r\n}, {@code \n} or {@code \r}; the previous implementation
     * split on {@code \r\n} only, so an LF-terminated file was stored as one giant "word"
     * and every lookup failed.
     *
     * @throws IOException if the file cannot be read
     */
    public static void initDictionary() throws IOException {
        Path path = Paths.get("words_alpha.txt");
        String wordListContents = new String(Files.readAllBytes(path), StandardCharsets.UTF_8);
        for (String line : wordListContents.split("\\r?\\n|\\r")) {
            String word = line.trim();
            if (!word.isEmpty()) {
                wordsSet.add(word);
            }
        }
    }

    static {
        try {
            initDictionary();
        } catch (IOException e) {
            // Best effort, as before: the class still loads, but say clearly what the consequence is.
            System.err.println("Dictionary: could not load words_alpha.txt (" + e
                    + "); every lookup will report 'not a word'");
        }
    }

    /**
     * Tests whether {@code word} appears in the loaded list.
     *
     * @param word the token to look up; compared exactly (case-sensitive)
     * @return {@code true} if the token is one of the loaded lines
     */
    public static boolean contains(String word) {
        return wordsSet.contains(word);
    }
}
/**
 * Repairs words that contain a misplaced hyphen (for example {@code sente-nce}) by looking
 * candidates up in {@link Dictionary}.
 *
 * <p>Both methods also print diagnostic lines to standard output / standard error.
 */
public class AutoCorrectHypen {
    /**
     * Two runs of ASCII letters joined by a single hyphen. Whitespace or a line break after the
     * hyphen is NOT matched, so a word split across two lines is left untouched.
     */
    private static final Pattern HYPHENATED_WORD = Pattern.compile("([a-zA-Z]+)-([a-zA-Z]+)");

    /**
     * Returns a copy of the text in which every {@code letters-letters} token has been replaced
     * by the result of {@link #autoCorrectHypen_innerMatch}; all other text is copied unchanged.
     *
     * @param content_ValidateOn the text to scan
     * @return the corrected text
     */
    public static String autoCorrectHypen(String content_ValidateOn) {
        Matcher matcher = HYPHENATED_WORD.matcher(content_ValidateOn);
        // StringBuffer because that is the appendReplacement overload available on every Java version.
        StringBuffer content_Replaced = new StringBuffer();
        while (matcher.find()) {
            String replacement = autoCorrectHypen_innerMatch(matcher.group(0), matcher.group(1), matcher.group(2));
            // appendReplacement copies the text between the previous match and this one, then the
            // replacement; quoteReplacement keeps '$' and '\' from being read as group references.
            matcher.appendReplacement(content_Replaced, Matcher.quoteReplacement(replacement));
        }
        System.out.println("-------");
        // append the content after the last match group
        matcher.appendTail(content_Replaced);
        return content_Replaced.toString();
    }

    /**
     * Decides what one hyphenated token should become.
     *
     * <p>The checks run in this order, and the order matters:
     * <ol>
     *   <li>the token including its hyphen is in the dictionary: keep it;</li>
     *   <li>the two halves joined without the hyphen are in the dictionary: use the joined word;</li>
     *   <li>both halves are dictionary words on their own: keep the hyphenated token;</li>
     *   <li>otherwise: keep the token and report "No such word" on standard error.</li>
     * </ol>
     *
     * @param content_SearchOn_innerMatch_G0 the whole token, hyphen included
     * @param content_SearchOn_innerMatch_G1 the letters before the hyphen
     * @param content_SearchOn_innerMatch_G2 the letters after the hyphen
     * @return the text that should replace the token
     */
    protected static String autoCorrectHypen_innerMatch(String content_SearchOn_innerMatch_G0, String content_SearchOn_innerMatch_G1, String content_SearchOn_innerMatch_G2) {
        System.out.printf("> %s; %s; %s; %n", content_SearchOn_innerMatch_G0, content_SearchOn_innerMatch_G1, content_SearchOn_innerMatch_G2);
        final String joined = content_SearchOn_innerMatch_G1 + content_SearchOn_innerMatch_G2;
        final String content_Replaced_innerMatch;
        // #atten: order of the if stmt matters
        if (Dictionary.contains(content_SearchOn_innerMatch_G0)) {
            content_Replaced_innerMatch = content_SearchOn_innerMatch_G0;
            System.out.printf(">> %s: %n%s %n", "whole word - with hypen, G0", content_Replaced_innerMatch);
        } else if (Dictionary.contains(joined)) {
            content_Replaced_innerMatch = joined;
            System.out.printf(">> %s: %n%s %n", "whole word - remove hypen, G1 + G2", content_Replaced_innerMatch);
        } else if (Dictionary.contains(content_SearchOn_innerMatch_G1) && Dictionary.contains(content_SearchOn_innerMatch_G2)) {
            content_Replaced_innerMatch = content_SearchOn_innerMatch_G0;
            System.out.printf(">> %s: %n%s %n", "whole word - with hypen, G1 && G2", content_Replaced_innerMatch);
        } else {
            content_Replaced_innerMatch = content_SearchOn_innerMatch_G0;
            System.err.println(">> No such word");
        }
        return content_Replaced_innerMatch;
    }

    //################################################################################################
    /** Single-line sample tokens; this is the input used by {@link #main}. */
    static final String content_TESTING_Simple = ""
            + "Check the word sente-nce, event-driven, family-owned, chocolate-covered, anti-clockwise.\n"
            + "samp-le, diff-erence, what-do-you-mean, how-ever, be-cause, other-wise, pill-ow";
    /** Sample that also contains a word split across a line break (not matched by the pattern). */
    static final String content_TESTING = ""
            + "Imagine you have this sample sentence, which is a very long sente-\n"
            + "nce that has a word being broken down with a hyphen. \n"
            + "\n"
            + "Check the word sente-nce, event-driven, family-owned, chocolate-covered, anti-clockwise.\n"
            + "";

    /** Runs the correction on {@link #content_TESTING_Simple} and prints the result. */
    public static void main(String[] args) throws Exception {
        System.out.println(autoCorrectHypen(content_TESTING_Simple)); //
    }
}
input
Check the word sente-nce, event-driven, family-owned, chocolate-covered, anti-clockwise.
samp-le, diff-erence, what-do-you-mean, how-ever, be-cause, other-wise, pill-ow
output
Check the word sentence, event-driven, family-owned, chocolate-covered, anticlockwise.
sample, difference, what-do-you-mean, however, because, otherwise, pillow

Related

How to find a string at a specific location mixed not english in java?

How to find a string at a specific location with regex?
choryangStn_110_220114_일_0.sbm
choryangStn_110_220114_이_0.sbm
choryangStn_110_220114_삼_0.sbm
At work, I would like to bring 일, 이, 삼
I tried
String filename = "choryangStn_110_220114_일_0.sbm";
filename.replaceAll(".*_(\\w+)_\\d+\\.\\w+", "$1");
If do like this, it will not work properly.
I wonder how can I satisfy \\w or [가-힣] .
filename.replaceAll(".*_(\\w+)||[가-힣]_\\d+\\.\\w+", "$1");
filename.replaceAll(".*_(\\w+||[가-힣])_\\d+\\.\\w+", "$1");
Both of the above sentences don't work properly.
I wonder how this is possible.
You can use the following regex with replaceFirst():
(?U)^.*_(\\w+)_\\d+\\.\\w+$
The (?U) is an embedded flag option that is equivalent of Pattern.UNICODE_CHARACTER_CLASS option that makes all shorthand character classes Unicode-aware.
See the regex demo and the Java demo:
import java.util.*;
import java.util.regex.*;
/**
 * Demo: for each sample file name, prints the name together with the word-character run that
 * sits just before the trailing {@code _<digits>.<extension>} part.
 */
class Test
{
    public static void main (String[] args) throws java.lang.Exception
    {
        // (?U) turns on UNICODE_CHARACTER_CLASS so that \w is not limited to ASCII.
        Pattern tokenPattern = Pattern.compile("(?U)^.*_(\\w+)_\\d+\\.\\w+$");
        String[] fileNames = {
            "choryangStn_110_220114_일_0.sbm",
            "choryangStn_110_220114_이_0.sbm",
            "choryangStn_110_220114_삼_0.sbm"
        };
        for (int i = 0; i < fileNames.length; i++)
        {
            String fileName = fileNames[i];
            // The pattern is anchored at both ends, so the whole name is replaced by group 1.
            String extracted = tokenPattern.matcher(fileName).replaceFirst("$1");
            System.out.println("'" + fileName + "' => '" + extracted + "'");
        }
    }
}
Output:
'choryangStn_110_220114_일_0.sbm' => '일'
'choryangStn_110_220114_이_0.sbm' => '이'
'choryangStn_110_220114_삼_0.sbm' => '삼'

JavaFX - TextField with regex for zipcode

For my program I want to use a TextField where the user can enter a zipcode (German ones). For that I tried what you can see below. If the user enters more than 5 digits, every additional digit shall be deleted immediately. Of course letters are not allowed.
When I use this pattern ^[0-9]{0,5}$ on https://regex101.com/ it does what I intended to, but when I try this in JavaFX it doesn't work. But I couldn't find a solution yet.
Can anyone tell me what I did wrong?
Edit: For people, who didn't work with JavaFX yet: When the user enters just one character, the method check(String text) is called. So the result should also be true, when there are 1 to 5 digits. But not more ;-)
/**
 * TextField subclass from the question that tries to restrict input to at most five digits
 * by vetoing edits in replaceText / replaceSelection.
 *
 * NOTE(review): the snippet ends without the class's closing brace, and "#Override" is
 * presumably a garbled "@Override" annotation — confirm against the original post.
 */
public class NumberTextField extends TextField{
// Label that check() updates when an edit is accepted.
ErrorLabel label;
/** Creates the field with initial text, fixed font/size settings and the label to update. */
NumberTextField(String text, ErrorLabel label){
setText(text);
setFont(Font.font("Calibri", 17));
setMinHeight(35);
setMinWidth(200);
setMaxWidth(200);
this.label = label;
}
// This constructor never assigns label, so check() would dereference null on an accepted edit.
NumberTextField(){}
/** Applies the edit only when check() accepts the text being inserted. */
#Override
public void replaceText(int start, int end, String text){
// Only the inserted fragment is validated here, not the text the field would hold afterwards.
if(check(text)) {
super.replaceText(start, end, text);
}
}
/** Replaces the selection only when check() accepts the replacement text. */
#Override
public void replaceSelection(String text){
if(check(text)){
super.replaceSelection(text);
}
}
/**
 * Returns true when the given text consists of zero to five ASCII digits, and in that case
 * also sets the label to "Success". The label is left unchanged when the text is rejected.
 */
private boolean check(String text){
if(text.matches("^[0-9]{0,5}$")){
label.setText("Success");
label.setBlack();
return true;
} else{
return false;
}
}
You don't need to extend TextField to do this. In fact I recommend using a TextFormatter, since this is simpler to implement:
It does not require you to overwrite multiple method. You simply need to decide based on the data about the desired input, if you want to allow the change or not.
// Zero to five digits: the empty string and partially typed zip codes must be accepted too.
final Pattern pattern = Pattern.compile("\\d{0,5}");
TextFormatter<?> formatter = new TextFormatter<>(change -> {
    // Validate the text the control would contain after the change, not just the typed fragment.
    boolean withinLimit = pattern.matcher(change.getControlNewText()).matches();
    if (!withinLimit) {
        // todo: add error message/markup
        return null; // prevent change
    }
    // todo: remove error message/markup
    return change; // allow this change to happen
});
TextField textField = new TextField();
textField.setTextFormatter(formatter);
Your original expression should be working fine, if we wish to validate a five-digits zip though, we might want to drop the 0 quantifier:
^[0-9]{5}$
^\d{5}$
For validation purposes, we might want to keep the start and end anchors, however for just testing, we can remove and see:
[0-9]{5}
\d{5}
It is likely that some other chars, would get through our inputs, which we do not wish to have.
Demo
Test
import java.util.regex.Matcher;
import java.util.regex.Pattern;
// Exactly five digits on a line of their own.
final String regex = "^[0-9]{5}$";
// Four sample lines: one valid code, one too long, two too short.
final String string = String.join("\n", "01234", "012345", "0", "1234");
// MULTILINE lets ^ and $ anchor at every line of the sample, not only at its two ends.
final Pattern pattern = Pattern.compile(regex, Pattern.MULTILINE);
final Matcher matcher = pattern.matcher(string);
while (matcher.find()) {
    System.out.println("Full match: " + matcher.group(0));
    // The pattern has no capturing groups, so this loop only runs if groups are added.
    int group = 1;
    while (group <= matcher.groupCount()) {
        System.out.println("Group " + group + ": " + matcher.group(group));
        group++;
    }
}

REGEX: find pattern, extract data, and replace the searched pattern with a string in relations to the content of searched

As the title says, I would like to:
1) find a pattern. Ex: getting $(getThisString+100) from this string: "this is some random string $(getThisString+100)"
2) extract data. Ex: grabbing the value 100 from $(getThisString+100)
3) replace the searched pattern with a string in relations to the content of searched. Ex: replace $(getThisString+100) with 150 (100 + 50) (this 50 is any number that I just made up)
so in the end, i will need "this is some random string 150"
I'm quite new to regex, please let me know if this is possible.
Thanks a lot
How about this?
package myparser;
import static org.junit.Assert.assertEquals;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.junit.Test;
public class ParseTest {
private final Pattern re = Pattern.compile(".*?(\\$\\(getThisString\\+(\\d+)\\)).*");
public String parseMyString(final String str) {
Matcher m = re.matcher(str);
if (m.matches()) {
int val = Integer.parseInt(m.group(2));
int newVal = val + 50;
String strStart = str.substring(0, m.start(1));
String strEnd = str.substring(m.end(1));
return parseMyString(strStart + String.valueOf(newVal) + strEnd);
} else {
return str;
}
}
#Test
public void testParseNone() {
String in = "this is some random string)";
String out = parseMyString(in);
assertEquals(in, out);
}
#Test
public void testParseOne() {
String in = "this is some random string $(getThisString+100)";
String out = parseMyString(in);
assertEquals("this is some random string 150", out);
}
#Test
public void testParseMultiple() {
String in = "this is some random string $(getThisString+100) and some more random $(getThisString+60)";
String out = parseMyString(in);
assertEquals("this is some random string 150 and some more random 110", out);
}
}
Notes:
Just in case you don't know: this is a JUnit test.
I have used recursion to parse multiple strings. In my opinion this is a lot easier to read. When you are dealing with thousands of replacements this might lead to a stack overflow though. In that case you will have to rewrite the recursion to a loop.

GWT Regex and empty string

Could someone explain why this snip :
// import com.google.gwt.regexp.shared.MatchResult;
// import com.google.gwt.regexp.shared.RegExp;
// Pattern that matches only the empty string, compiled without any flags.
RegExp regExp = RegExp.compile("^$");
MatchResult matcher;
// NOTE(review): nothing in this loop changes the regex or its input, so if exec("") matches
// once it will presumably match on every iteration and the loop never ends — confirm.
while ((matcher = regExp.exec("")) != null)
{
// Prints the MatchResult object itself, not the matched text.
System.out.println("match " + matcher);
}
gives an enormous number of matches? I tested with the different modifiers allowed by the GWT implementation of compile(): g, i and m. It works only with m (multiline).
I just want to check for empty string.
[EDIT] the new method
/**
 * Collects every match of the cached regular expression in {@code input}.
 *
 * <p>The expression is compiled from {@code pattern} with the global flag the first time this
 * method runs and is then reused; NOTE(review): later calls with a different {@code pattern}
 * still use the cached expression — confirm that is intended.
 *
 * @param input   the text to search
 * @param pattern the regular expression source, used only when nothing is cached yet
 * @return the matches in order of appearance; empty if there are none
 */
private ArrayList<MatchResult> getMatches(String input, String pattern)
{
    ArrayList<MatchResult> matches = new ArrayList<MatchResult>();
    if(null == regExp)
    {
        regExp = RegExp.compile(pattern, "g");
    }
    if(input.isEmpty())
    {
        // empty string : just check if the pattern validates and
        // don't try to extract matches : it would result in an infinite
        // loop.
        if(regExp.test(input))
        {
            matches.add(new MatchResult(0, "", new ArrayList<String>(0)));
        }
        return matches;
    }
    // Start from the beginning even if an earlier search on this cached regex was cut short.
    regExp.setLastIndex(0);
    for(MatchResult matcher = regExp.exec(input); matcher != null; matcher = regExp.exec(input))
    {
        matches.add(matcher);
        if(matcher.getGroup(0).length() == 0)
        {
            // A zero-length match leaves lastIndex where it was, so the next exec() would
            // return the same match forever; step past it by hand.
            regExp.setLastIndex(regExp.getLastIndex() + 1);
        }
    }
    return matches;
}
Your regExp.exec("") with RegExp.compile("^$") will never return null, as the empty string "" is a match for regex ^$, which reads "nothing between beginning and the end of line/string".
So your while loop is an infinite loop.
Also, your print statement is
System.out.println("match " + matcher);
...but you probably wanted to use
System.out.println("match " + matcher.getGroup(0));
Also see GWT checking if textbox is empty.

How do I check if a filename matches a wildcard pattern

I've got a wildcard pattern, perhaps "*.txt" or "POS??.dat".
I also have list of filenames in memory that I need to compare to that pattern.
How would I do that, keeping in mind I need exactly the same semantics that IO.DirectoryInfo.GetFiles(pattern) uses.
EDIT: Blindly translating this into a regex will NOT work.
I have a complete answer in code for you that's 95% like FindFiles(string).
The 5% that isn't there is the short names/long names behavior in the second note on the MSDN documentation for this function.
If you would still like to get that behavior, you'll have to complete a computation of the short name of each string you have in the input array, and then add the long name to the collection of matches if either the long or short name matches the pattern.
Here is the code:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace FindFilesRegEx
{
    /// <summary>
    /// Demo program: filters an in-memory list of file names with a wildcard pattern that is
    /// first converted to a regular expression by <see cref="FindFilesPatternToRegex"/>.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            string[] names = { "hello.t", "HelLo.tx", "HeLLo.txt", "HeLLo.txtsjfhs", "HeLLo.tx.sdj", "hAlLo20984.txt" };
            // FindFilesEmulator is an instance method, so the static Main needs an instance to call it.
            Program program = new Program();
            string[] matches;
            matches = program.FindFilesEmulator("hello.tx", names);
            matches = program.FindFilesEmulator("H*o*.???", names);
            matches = program.FindFilesEmulator("hello.txt", names);
            matches = program.FindFilesEmulator("lskfjd30", names);
        }

        /// <summary>Returns the entries of <paramref name="names"/> that match the wildcard pattern.</summary>
        /// <param name="pattern">Wildcard pattern using <c>*</c> and <c>?</c>.</param>
        /// <param name="names">Candidate file names.</param>
        /// <returns>The matching names, in their original order.</returns>
        /// <exception cref="ArgumentNullException">The pattern is null.</exception>
        /// <exception cref="ArgumentException">The pattern is empty or contains an illegal character.</exception>
        public string[] FindFilesEmulator(string pattern, string[] names)
        {
            List<string> matches = new List<string>();
            Regex regex = FindFilesPatternToRegex.Convert(pattern);
            foreach (string s in names)
            {
                if (regex.IsMatch(s))
                {
                    matches.Add(s);
                }
            }
            return matches.ToArray();
        }

        /// <summary>Translates a wildcard pattern into an anchored, case-insensitive <see cref="Regex"/>.</summary>
        internal static class FindFilesPatternToRegex
        {
            // Detects a '?' wildcard anywhere in the pattern.
            private static Regex HasQuestionMarkRegEx = new Regex(@"\?", RegexOptions.Compiled);
            // Characters rejected in a pattern: / : < > | and the double quote.
            private static Regex IllegalCharactersRegex = new Regex("[" + @"\/:<>|" + "\"]", RegexOptions.Compiled);
            // Captures the text after the last dot (the extension) in group 1.
            private static Regex CatchExtentionRegex = new Regex(@"^\s*.+\.([^\.]+)\s*$", RegexOptions.Compiled);
            // Appended after a three-character extension so that longer extensions also match.
            private static string NonDotCharacters = @"[^.]*";

            /// <summary>Builds the regular expression for one wildcard pattern.</summary>
            /// <param name="pattern">Wildcard pattern; surrounding whitespace is trimmed.</param>
            /// <returns>A compiled, case-insensitive regex anchored at both ends.</returns>
            /// <exception cref="ArgumentNullException">The pattern is null.</exception>
            /// <exception cref="ArgumentException">The pattern is empty or contains an illegal character.</exception>
            public static Regex Convert(string pattern)
            {
                if (pattern == null)
                {
                    throw new ArgumentNullException();
                }
                pattern = pattern.Trim();
                if (pattern.Length == 0)
                {
                    throw new ArgumentException("Pattern is empty.");
                }
                if (IllegalCharactersRegex.IsMatch(pattern))
                {
                    throw new ArgumentException("Pattern contains illegal characters.");
                }
                bool hasExtension = CatchExtentionRegex.IsMatch(pattern);
                // "Exact" means the name must end where the pattern ends. That holds when the
                // pattern contains '?' or its extension is not exactly three characters long.
                bool matchExact = false;
                if (HasQuestionMarkRegEx.IsMatch(pattern))
                {
                    matchExact = true;
                }
                else if (hasExtension)
                {
                    matchExact = CatchExtentionRegex.Match(pattern).Groups[1].Length != 3;
                }
                // Escape everything, then turn the escaped wildcards back into regex wildcards.
                string regexString = Regex.Escape(pattern);
                regexString = "^" + Regex.Replace(regexString, @"\\\*", ".*");
                regexString = Regex.Replace(regexString, @"\\\?", ".");
                if (!matchExact && hasExtension)
                {
                    // Three-character extension without '?': also accept names whose extension
                    // merely starts with it (e.g. "*.txt" matches "a.txtfoo").
                    regexString += NonDotCharacters;
                }
                regexString += "$";
                Regex regex = new Regex(regexString, RegexOptions.Compiled | RegexOptions.IgnoreCase);
                return regex;
            }
        }
    }
}
You can simply do this. You do not need regular expressions.
using Microsoft.VisualBasic.CompilerServices;
// LikeString is the runtime helper behind VB's Like operator; the third argument selects
// text comparison. NOTE(review): CompareMethod lives in Microsoft.VisualBasic, which the
// snippet's using line does not import — confirm.
bool matchesPattern = Operators.LikeString("pos123.txt", "pos?23.*", CompareMethod.Text);
if (matchesPattern)
{
    Console.WriteLine("Filename matches pattern");
}
Or, in VB.Net,
' Evaluate the Like comparison first, then report the result.
Dim matchesPattern As Boolean = "pos123.txt" Like "pos?23.*"
If matchesPattern Then
    Console.WriteLine("Filename matches pattern")
End If
In c# you could simulate this with an extension method. It wouldn't be exactly like VB Like, but it would be like...very cool.
You could translate the wildcards into a regular expression:
*.txt -> ^.+\.txt$
POS??.dat -> ^POS..\.dat$
Use the Regex.Escape method to escape the characters that are not wildcards into literal strings for the pattern (e.g. converting ".txt" to "\.txt").
The wildcard * translates into .+, and ? translates into .
Put ^ at the beginning of the pattern to match the beginning of the string, and $ at the end to match the end of the string.
Now you can use the Regex.IsMatch method to check if a file name matches the pattern.
Just call the Windows API function PathMatchSpecExW().
/// <summary>Flag values passed through to PathMatchSpecExW.</summary>
[Flags]
public enum MatchPatternFlags : uint
{
    /// <summary>PMSF_NORMAL</summary>
    Normal = 0x00000000,
    /// <summary>PMSF_MULTIPLE</summary>
    Multiple = 0x00000001,
    /// <summary>PMSF_DONT_STRIP_SPACES</summary>
    DontStripSpaces = 0x00010000
}

/// <summary>File-name pattern matching backed by the Shlwapi PathMatchSpecExW function.</summary>
class FileName
{
    // Native entry point; both strings are marshalled as wide (UTF-16) strings.
    [DllImport("Shlwapi.dll", SetLastError = false)]
    static extern int PathMatchSpecExW(
        [MarshalAs(UnmanagedType.LPWStr)] string file,
        [MarshalAs(UnmanagedType.LPWStr)] string spec,
        MatchPatternFlags flags);

    /// <summary>Matches a file name against one or more file name patterns.</summary>
    /// <param name="file">File name to check.</param>
    /// <param name="spec">Name pattern(s) to search for.</param>
    /// <param name="flags">Flags to modify the search condition.</param>
    /// <returns>
    /// True if the name matches the pattern. A null or empty <paramref name="file"/> never
    /// matches; otherwise a null or empty <paramref name="spec"/> always matches. Neither case
    /// calls the native function.
    /// </returns>
    public static bool MatchPattern(string file, string spec, MatchPatternFlags flags)
    {
        if (string.IsNullOrEmpty(file))
        {
            return false;
        }
        if (string.IsNullOrEmpty(spec))
        {
            return true;
        }
        // A zero return value from the native call is treated as "matched".
        return PathMatchSpecExW(file, spec, flags) == 0;
    }
}
Some kind of regex/glob is the way to go, but there are some subtleties; your question indicates you want identical semantics to IO.DirectoryInfo.GetFiles. That could be a challenge, because of the special cases involving 8.3 vs. long file names and the like. The whole story is on MSDN.
If you don't need an exact behavioral match, there are a couple of good SO questions:
glob pattern matching in .NET
How to implement glob in C#
For anyone who comes across this question now that it is years later, I found over at the MSDN social boards that the GetFiles() method will accept * and ? wildcard characters in the searchPattern parameter. (At least in .Net 3.5, 4.0, and 4.5)
Directory.GetFiles(string path, string searchPattern)
http://msdn.microsoft.com/en-us/library/wz42302f.aspx
Please try the code below.
/// <summary>
/// Demo entry point: prints the names from a small in-memory list that match "*.txt".
/// </summary>
static void Main(string[] args)
{
    string _wildCardPattern = "*.txt";
    List<string> _fileNames = new List<string>();
    _fileNames.Add("text_file.txt");
    _fileNames.Add("csv_file.csv");
    Console.WriteLine("\nFilenames that matches [{0}] pattern are : ", _wildCardPattern);
    // The pattern is the same for every name, so build the regex once instead of per iteration.
    CustomWildCardPattern _pattern = new CustomWildCardPattern(_wildCardPattern);
    foreach (string _fileName in _fileNames)
    {
        if (_pattern.IsMatch(_fileName))
        {
            Console.WriteLine("{0}", _fileName);
        }
    }
}
/// <summary>
/// A <see cref="Regex"/> built from a wildcard pattern in which <c>*</c> stands for any run of
/// characters and <c>?</c> for exactly one character. Matching is anchored at both ends.
/// </summary>
public class CustomWildCardPattern : Regex
{
    /// <summary>Creates the pattern with default regex options (case-sensitive).</summary>
    public CustomWildCardPattern(string wildCardPattern)
        : base(WildcardPatternToRegex(wildCardPattern))
    {
    }

    /// <summary>Creates the pattern with caller-supplied regex options.</summary>
    public CustomWildCardPattern(string wildcardPattern, RegexOptions regexOptions)
        : base(WildcardPatternToRegex(wildcardPattern), regexOptions)
    {
    }

    // Escapes the whole pattern, then turns the escaped wildcards back into regex wildcards.
    private static string WildcardPatternToRegex(string wildcardPattern)
    {
        string escaped = Regex.Escape(wildcardPattern);
        string translated = escaped.Replace("\\*", ".*").Replace("\\?", ".");
        return "^" + translated + "$";
    }
}
For searching against a specific pattern, it might be worth using File Globbing which allows you to use search patterns like you would in a .gitignore file.
See here: https://learn.microsoft.com/en-us/dotnet/core/extensions/file-globbing
This allows you to add both inclusions & exclusions to your search.
Please see below the example code snippet from the Microsoft Source above:
// Matcher here is the file-globbing matcher from the linked Microsoft article, not a regex type.
Matcher matcher = new Matcher();
// Register "*.txt" as an include pattern.
matcher.AddIncludePatterns(new[] { "*.txt" });
// NOTE(review): `filepath` is not defined in this snippet; presumably the directory to search — confirm.
IEnumerable<string> matchingFiles = matcher.GetResultsInFullPath(filepath);
The use of RegexOptions.IgnoreCase will fix it.
/// <summary>
/// A <see cref="Regex"/> built from a wildcard pattern (<c>*</c> = any run of characters,
/// <c>?</c> = exactly one character), anchored at both ends. The one-argument constructor
/// matches case-insensitively.
/// </summary>
public class WildcardPattern : Regex
{
    /// <summary>Creates a case-insensitive wildcard pattern.</summary>
    public WildcardPattern(string wildCardPattern)
        : base(ConvertPatternToRegex(wildCardPattern), RegexOptions.IgnoreCase)
    {
    }

    /// <summary>Creates a wildcard pattern with caller-supplied regex options.</summary>
    public WildcardPattern(string wildcardPattern, RegexOptions regexOptions)
        : base(ConvertPatternToRegex(wildcardPattern), regexOptions)
    {
    }

    // Escapes the whole pattern, restores the two wildcards as regex constructs, then anchors it.
    private static string ConvertPatternToRegex(string wildcardPattern)
    {
        string escaped = Regex.Escape(wildcardPattern);
        string withStars = escaped.Replace("\\*", ".*");
        string withSingles = withStars.Replace("\\?", ".");
        return "^" + withSingles + "$";
    }
}