How to avoid Switch Case & sscanf Function - c++

I am working in C++. I am writing code that handles date formats. With it, we can convert a date given in any other format into a default date format. I have found 240 date formats for this task, so I want to use a switch statement and the sscanf function. Every case has an sscanf call to separate the day, month, and year, so I need 240 cases and 240 sscanf calls. Is there any way to avoid so many switch cases and sscanf calls? If you have any ideas, please let me know.
// Fragment of a switch body: each case parses tsdate with one fixed sscanf
// layout and stores the three scanned numbers in the order listed.
// Layout: NN/NN/NNNN, stored as day, month, year.
case 0:
sscanf(tsdate.c_str(),"%2d/%2d/%4d",&day,&month,&year);
break;
// Layout: NN-NN-NNNN, stored as month, day, year.
case 1:
sscanf(tsdate.c_str(),"%2d-%2d-%4d",&month,&day,&year);
break;
// Layout: NN NN NNNN, stored as day, month, year.
case 2:
sscanf(tsdate.c_str(),"%2d %2d %4d",&day,&month,&year);
break;
// Layout: NN/NN/NN, stored as day, month, two-digit year; coryear() is then
// applied to the year (presumably expanding it to four digits - not shown here).
case 3:
sscanf(tsdate.c_str(),"%2d/%2d/%2d",&day,&month,&year);
coryear(year);
break;
// Same layout as case 3 but stored as year, month, day.
case 4:
sscanf(tsdate.c_str(),"%2d/%2d/%2d",&year,&month,&day);
coryear(year);
break;
Like above, I want to put 240 cases and 240 sscanf. Please let me know how to avoid a lot of cases.

You can't avoid the switch cases, but you can create a few helper functions to avoid repeating the sscanf calls:
// Parses tsdate with the given sscanf format and stores the three scanned
// numbers as day, month, year (in that order).
// tsdate, day, month and year are not declared here; they must be visible
// from the enclosing scope.
void scanDayFirst(string format)
{
    // sscanf needs a C string for the format, so pass format.c_str().
    // There is no `break` here: it is only valid inside a loop or switch.
    sscanf(tsdate.c_str(), format.c_str(), &day, &month, &year);
}
// Parses tsdate with the given sscanf format and stores the three scanned
// numbers as month, day, year (in that order).
// tsdate, day, month and year are not declared here; they must be visible
// from the enclosing scope.
void scanMonthFirst(string format)
{
    // sscanf needs a C string for the format, so pass format.c_str().
    // There is no `break` here: it is only valid inside a loop or switch.
    sscanf(tsdate.c_str(), format.c_str(), &month, &day, &year);
}
and so on.....
the result will be like this:
case 0:
scanDayFirst("%2d/%2d/%4d");
case 1:
scanDayFirst("%2d-%2d-%4d");
case 2:
scanDayFirst("%2d %2d %4d");
case 3:
scanDayFirst("%2d.%2d.%4d");

/* Indices into parts[]: one slot per date component; NUM_ITEMS is the slot count. */
enum { ITEM_YEAR, ITEM_MONTH, ITEM_DAY, NUM_ITEMS };
/* One supported layout: the sscanf format string plus, for each conversion in
   the format (in order), the parts[] slot that receives the scanned value. */
struct date_format { char const *fmt; int items[NUM_ITEMS]; };
/* Table of layouts; the index into this table replaces the switch case number. */
struct date_format const formats[] =
{ { "%2d/%2d/%4d", { ITEM_DAY, ITEM_MONTH, ITEM_YEAR } }
, { "%2d-%2d-%4d", { ITEM_MONTH, ITEM_DAY, ITEM_YEAR } }
/* etc. */
};
int parts[NUM_ITEMS]; /* Instead of year,month,day */
/* x is the index of the layout to use (not declared in this snippet). The
   i-th conversion is written into the slot named by formats[x].items[i]. */
sscanf(tsdate.c_str(), formats[x].fmt,
&parts[formats[x].items[0]],
&parts[formats[x].items[1]],
&parts[formats[x].items[2]]);
You can extend this to have ITEM_NONE if you want to skip an item, or add extra items, and so on.
NB. If this is C++ then consider using stream input instead of sscanf.

Instead of enumerating all cases consider instead using a custom language for the format specification:
Date x = parseDate(user_input, "dd-mm-yyyy");
it will make the function shorter, easier to document and easier to use also improving the readability of the code that uses it. The idea is using codes like
yyyy ............. 4-digits year
yy ............... 2-digits year with automatic century computation
mm ............... 2-digits month
m ................ 1 or 2 digits month
dd ............... 2-digits day
d ................ 1 or 2 digits day
anything else .... mandatory character
A simple implementation could be
// Parses `input` according to the mini-language in `format`:
//   yyyy / yy  -> year (yy goes through guess_century)
//   mm / m     -> month
//   dd / d     -> day
//   any other character must appear literally in the input.
// Throws invalid_date when the input ends early, a literal does not match,
// input is left over, or any of year/month/day was never set.
// getInt, guess_century, invalid_date and Date are declared elsewhere;
// getInt presumably consumes digits and advances the pointer it is given.
Date parseDate(const std::string& input, const std::string& format) {
    const char *cursor = input.c_str();    // read position in the input
    const char *pattern = format.c_str();  // read position in the format
    int year = -1;
    int month = -1;
    int day = -1;

    while (*pattern != '\0') {
        // The format still expects something but the input is exhausted.
        if (*cursor == '\0') throw invalid_date(input, format);

        // Longer tokens are tested before their shorter prefixes.
        if (!strncmp(pattern, "yyyy", 4)) {
            pattern += 4;
            year = getInt(cursor, 4);
            continue;
        }
        if (!strncmp(pattern, "yy", 2)) {
            pattern += 2;
            year = guess_century(getInt(cursor, 2));
            continue;
        }
        if (!strncmp(pattern, "mm", 2)) {
            pattern += 2;
            month = getInt(cursor, 2);
            continue;
        }
        if (*pattern == 'm') {
            pattern += 1;
            month = getInt(cursor, -1);
            continue;
        }
        if (!strncmp(pattern, "dd", 2)) {
            pattern += 2;
            day = getInt(cursor, 2);
            continue;
        }
        if (*pattern == 'd') {
            pattern += 1;
            day = getInt(cursor, -1);
            continue;
        }

        // Anything else is a mandatory literal character.
        if (*cursor != *pattern) throw invalid_date(input, format);
        ++cursor;
        ++pattern;
    }

    const bool leftover_input = (*cursor != '\0');
    const bool missing_field = (year == -1 || month == -1 || day == -1);
    if (leftover_input || missing_field)
        throw invalid_date(input, format);
    return Date(year, month, day);
}
NOTE: untested

Related

C Write / Read Data From Binary File

UPDATE
IBM HC-486 1995 11 12 228 Иванов IBM HC-476 1990 1 42 218 Васильев
So i kinda try to read two records. First one fits out well. Second looks bad.
I kinda fixed suggestions thanks a lot it helped to move forward. So for now i stuck on outputing two records.
Result is ->
mark = IBM HC-486 year = 1995 month = 11 day = 12 numroom = 228 lastname = Ивановmark = IBM HC-47 year = 6 month = 1990
day = 1 numroom = 42 lastname = 218mark = Васи� year = 6 month = 1990 day = 1 numroom = 42 lastname = �ьев
Making a binary file out of structs, attempting to print out all cointaining..
ONLY scanf/printf/FILE/struct
Here's a code...
Lab.h
#pragma once
// Reads records from input.txt and writes them as binary structs to output.dat.
void input();
// Prints the first record in output.dat whose year is 1995 and mark is "IBM HC-486".
void find();
// Returns the number of days in the given month of the given year.
int getdays(int year, int month);
// Prompts for a replacement for every out-of-range month or day in output.dat.
void correction();
// Prints every record stored in output.dat.
void print();
Lab.cpp
#include "Lab.h"
#include <stdio.h> //FILE
#include <iostream>
#include <conio.h> //getch
#include <windows.h>
#include <io.h>
// One computer record. The struct is written to and read from output.dat
// verbatim with fwrite/fread, so its in-memory layout is also the file format.
struct Computer
{
// Model name; filled by fgetws with a limit of 11, i.e. at most 10 wide characters plus the terminator.
wchar_t mark[11];
// Date components as plain integers; month and day are range-checked in correction().
int year;
int month;
int day;
// Room number, stored in a single byte.
unsigned char numroom;
// Last name, at most 19 wide characters plus the terminator.
wchar_t lastname[20];
};
// Converts the text file input.txt into the binary file output.dat.
// Each record in the text is: mark (read with fgetws, up to 10 wide chars),
// year, month, day, room number, last name. Reading stops at the first
// record that cannot be read completely, so no partial record is written.
// Returns silently if either file cannot be opened.
void input()
{
    FILE *inputFile = NULL, *outputFile = NULL;
    if (fopen_s(&outputFile, "output.dat", "wb") != 0 || outputFile == NULL)
        return;
    if (fopen_s(&inputFile, "input.txt", "r") != 0 || inputFile == NULL)
    {
        fclose(outputFile);
        return;
    }
    // NOTE(review): the file is opened without a ccs= encoding flag; whether
    // non-ASCII last names convert correctly depends on the locale - confirm.
    for (;;)
    {
        Computer c = {};
        // Skip the whitespace left behind by the previous record's last name;
        // otherwise it becomes the first character of the next mark and
        // shifts every following field.
        fwscanf_s(inputFile, L" ");
        if (fgetws(c.mark, (int)_countof(c.mark), inputFile) == NULL)
            break;
        // Use wide-character reads only: mixing narrow and wide operations on
        // one stream is not allowed. %d (not %i) so "08"/"09" are not treated
        // as malformed octal numbers.
        if (fwscanf_s(inputFile, L"%d %d %d %hhu", &c.year, &c.month, &c.day, &c.numroom) != 4)
            break;
        // The *_s scanners expect the buffer size as `unsigned`, not size_t.
        if (fwscanf_s(inputFile, L"%s", c.lastname, (unsigned)_countof(c.lastname)) != 1)
            break;
        if (fwrite(&c, sizeof(struct Computer), 1, outputFile) != 1)
            break;
    }
    fclose(inputFile);
    fclose(outputFile);
    return;
}
// Scans output.dat and prints the first record whose year is 1995 and whose
// mark is exactly "IBM HC-486", then waits for a key press.
// Returns silently if the file cannot be opened.
void find()
{
    FILE *outputFile = NULL;
    if (fopen_s(&outputFile, "output.dat", "rb+") != 0 || outputFile == NULL)
        return;
    Computer c;
    while (fread(&c, sizeof(struct Computer), 1, outputFile) == 1)
    {
        if (c.year == 1995 && wcscmp(L"IBM HC-486", c.mark) == 0)
        {
            wprintf_s(L"\nmark = %s year = %i month = %i day = %i numroom = %i lastname = %s",
                c.mark, c.year, c.month, c.day, c.numroom, c.lastname);
            break;
        }
    }
    // Close on every path; previously the stream leaked when nothing matched.
    fclose(outputFile);
    _getch();
    return;
}
// Returns the number of days in `month` of `year`.
// April, June, September and November have 30 days; February has 28 or 29
// depending on the Gregorian leap-year rule; every other month value
// (including values outside 1..12) yields 31.
int getdays(int year, int month)
{
    switch (month)
    {
    case 4:
    case 6:
    case 9:
    case 11:
        return 30;
    case 2:
    {
        // Leap year: divisible by 4 but not by 100, or divisible by 400.
        const bool is_leap = (year % 400 == 0) || (year % 4 == 0 && year % 100 != 0);
        return is_leap ? 29 : 28;
    }
    default:
        return 31;
    }
}
void correction()
{
FILE* outputFile;
fopen_s(&outputFile, "output.dat", "rb+");
fseek(outputFile, 0, 0);
Computer c;
long item = 0;
while (fread(&c, sizeof(struct Computer), 1, outputFile))
{
while (c.month < 1 || c.month > 12)
{
wprintf_s(L"mark = %s year = %i month = %i day = %i numroom = %i lastname = %s",
c.mark, c.year, c.month, c.day, c.numroom, c.lastname);
wprintf_s(L"%s%i", L"Некорректный номер месяца \nПожалуйста введите другой номер месяца:", c.month);
scanf_s("%i", &c.month);
fseek(outputFile, item * sizeof(struct Computer), 0);
fwrite(&c, sizeof(struct Computer), 1, outputFile);
}
while (c.day < 1 || c.day > getdays(c.year, c.month))
{
wprintf_s(L"mark = %s year = %i month = %i day = %i numroom = %i lastname = %s",
c.mark, c.year, c.month, c.day, c.numroom, c.lastname);
wprintf_s(L"%s%i", L"Некорректный номер дня\nПожалуйста введите другой номер дня:", c.day);
scanf_s("%i", &c.day);
fseek(outputFile, item * sizeof(struct Computer), 0);
fwrite(&c, sizeof(struct Computer), 1, outputFile);
}
item += 1;
}
_getch();
_fcloseall();
return;
}
// Prints every record stored in output.dat, then waits for a key press.
// Returns silently if the file cannot be opened.
void print()
{
    FILE* outputFile = NULL;
    if (fopen_s(&outputFile, "output.dat", "rb+") != 0 || outputFile == NULL)
        return;
    Computer c;
    // fread returns the number of complete records read; stop on a short read.
    while (fread(&c, sizeof(struct Computer), 1, outputFile) == 1)
    {
        wprintf_s(L"mark = %s year = %d month = %i day = %i numroom = %i lastname = %s",
            c.mark, c.year, c.month, c.day, c.numroom, c.lastname);
    }
    _getch();
    fclose(outputFile);
    return;
}
Lab2.cpp
#include <windows.h>
#include "Lab.h"
// Entry point: switches the console to code page 65001 for input and output,
// builds output.dat from input.txt and prints its contents.
int main()
{
    const unsigned int console_code_page = 65001;
    SetConsoleCP(console_code_page);
    SetConsoleOutputCP(console_code_page);

    input();
    print();
    //find();
    //correction();
    return 0;
}
There are two main problems with that. The first one has already been pointed out by Johnny Mopp, in that your call to fgetws requires a minimum size of c.mark of 11 elements, so you are overflowing it.
And regarding why you read 0 as the year, it's due to this overflow and the fact that you are trying to manually add the NULL terminator to c.mark:
c.mark[wcslen(c.mark) - 1] = '\0';
As you are already overflowing c.mark, this happens to go right into c.year and sets it to 0 (try putting this line right after reading c.mark and you will see that you read the correct year).
In fact, this is not necessary because fgetws already includes the NULL terminator (your call will only read 10 characters and add '\0' as character 11).
Even then, take into account that your attempt to add the NULL terminator is bound to fail, because wcslen does not work unless there is already a NULL terminator, so you are trying to set a NULL terminator where there is already one. Besides, you are removing the last character in the string due to the -1.
Imagine that you have a string with only one character, L"A". If you perform that operation, wcslen will return 1, and if you subtract 1 you are doing c.mark[0] = L'\0', thus converting the string to L"". In this case it would be better to use the element count of the array (sizeof(c.mark) / sizeof(c.mark[0]), or _countof(c.mark)) instead of wcslen, because it yields 11 regardless of the content, and subtracting 1 gives c.mark[10] = L'\0', which is what you really want. (Note that plain sizeof(c.mark) is the size in bytes, not the number of wide characters.)
Nevertheless, as I said before, it's unnecessary because fgetws already takes care of the NULL terminator for you (take a look at the Remarks section of https://learn.microsoft.com/es-es/cpp/c-runtime-library/reference/fgets-fgetws?view=msvc-160).
UPDATE
Regarding the decission on when to end reading, I usually read until I run out of data, regardless of the file size. That would mean making the loop run forever with while (true), and checking the output of the fgetws and fscanf, as others have suggested. If you take a look at the documentation of fgetws (the link I wrote before) you can see in the Return value section that it returns a pointer to the buffer on success (this is not useful normally) but it returns NULL in case of an error or end-of-file. You can use this to break the loop if there is an error when you read mark by doing:
if (fgetws(c.mark, 11, inputFile) == NULL)
break;
Similarly, fscanf_s returns EOF in case of an error or end-of-file (https://learn.microsoft.com/es-es/cpp/c-runtime-library/reference/fscanf-s-fscanf-s-l-fwscanf-s-fwscanf-s-l?view=msvc-160), so you could possibly add that condition whenever you read a value using fscanf_s. For instance:
if (fscanf_s(inputFile, "%d", &c.year) == EOF)
break;
And so with the rest. Or you could go just with the condition in fgetws, but that could lead to corrupt records if you have incomplete lines (where the fgetws succeeds but one or more of the fscanf_s fails). In the end it all boils down to how much work you want to put and how resilient do you want your code to be against invalid inputs.

How to use C++ Builder OPENARRAY in a custom function

I would like to use OPENARRAY (or an alternative, if you have one) to pass multiple placeholder variables into my function. I know that it can be used for String::Format in this way:
UnicodeString Text1 = "abc";
int Num2 = 1;
String::Format("Some %s and %d", OPENARRAY(TVarRec, (Text1, Num2));
What I would like is to use Text1 and Text2 variables in something like this:
MyFunction("Some %Txt1 and %Num2", OPENARRAY(TVarRec, ("%Txt1", Text1, "%Num2", Num2));
Or perhaps:
MyFunction(OPENARRAY(TVarRec, ("Some %Txt1 and %Num2", "%Txt1", Text1, "%Num2", Num2));
So it would accept the text and replace the placeholder variables with appropriate variable content.
What I don't know is how do I read OPENARRAY parameter content from MyFunction.
So the function would look like:
UnicodeString MyFunction(UnicodeString Txt, ?WHAT-HERE?)
{
// read openarray here and replace vars
return StringReplace(Txt, ?WHAT-HERE?);
}
So I don't know how do I accept the OPENARRAY variables. Also, if you have an alternative solution to passing placeholder, variable in similar manner (without the use of C++11), that would also be welcome.
When passing a Delphi-style Open Array to a function, two things are actually passed:
a pointer to the first element of the array
the index of the last element of the array (NOT the length of the array, as you might expect!).
So, in your example, you can declare your function like this:
UnicodeString MyFunction(UnicodeString Txt, const TVarRec *Values, const int Values_High)
And then loop through the array using normal pointer arithmetic.
TVarRec can hold many different data types, so you have to look at its VType field to know what kind of data it is actually referencing, and then access the appropriate data field. Some values (integers and single characters) are stored directly in the TVarRec itself, while others (strings and other class types) are referenced by pointer instead.
For example:
// Replaces placeholders in Txt with values taken from a Delphi-style open array.
// Values holds consecutive (placeholder, replacement) pairs: the element at an
// even index is the placeholder text and must be one of the string types; the
// element that follows it is the replacement and may be any of the types
// handled below. Values_High is the index of the LAST element (not the count).
// Every occurrence of each placeholder is replaced. Returns the resulting text.
// Throws Exception if the number of elements is odd or an element has a type
// that is not handled.
UnicodeString MyFunction(UnicodeString Txt, const TVarRec *Values, const int Values_High)
{
    // Values_High + 1 is the element count; pairs require an even count.
    if (((Values_High + 1) % 2) != 0)
        throw Exception("uneven number of values!");
    for (int index = 0; index <= Values_High; index += 2)
    {
        String OldValue, NewValue;
        // Placeholder: only string-like variants are accepted. Strings are
        // referenced through pointers stored in the TVarRec.
        switch (Values[index].VType)
        {
            case vtString:
                OldValue = * static_cast<const ShortString*>(Values[index].VString);
                break;
            case vtPChar:
                OldValue = Values[index].VPChar;
                break;
            case vtPWideChar:
                OldValue = Values[index].VPWideChar;
                break;
            case vtAnsiString:
                OldValue = * static_cast<const AnsiString*>(Values[index].VAnsiString);
                break;
            case vtWideString:
                OldValue = * static_cast<const WideString*>(Values[index].VWideString);
                break;
            case vtUnicodeString:
                OldValue = * static_cast<const UnicodeString*>(Values[index].VUnicodeString);
                break;
            default:
                throw Exception("illegal value type at index %d!", ARRAYOFCONST(( index )) );
        }
        // Replacement: integers, booleans and single characters are stored
        // directly in the TVarRec; the remaining types are stored by pointer.
        switch (Values[index+1].VType)
        {
            case vtInteger:
                NewValue = Values[index+1].VInteger;
                break;
            case vtBoolean:
                NewValue = Values[index+1].VBoolean;
                break;
            case vtChar:
                NewValue = Values[index+1].VChar;
                break;
            case vtExtended:
                NewValue = * static_cast<const Extended*>(Values[index+1].VExtended);
                break;
            case vtString:
                NewValue = * static_cast<const ShortString*>(Values[index+1].VString);
                break;
            case vtPChar:
                NewValue = Values[index+1].VPChar;
                break;
            case vtWideChar:
                NewValue = Values[index+1].VWideChar;
                break;
            case vtPWideChar:
                NewValue = Values[index+1].VPWideChar;
                break;
            case vtAnsiString:
                NewValue = * static_cast<const AnsiString*>(Values[index+1].VAnsiString);
                break;
            case vtCurrency:
                NewValue = * static_cast<const Currency*>(Values[index+1].VCurrency);
                break;
            case vtVariant:
                NewValue = * static_cast<const Variant*>(Values[index+1].VVariant);
                break;
            case vtWideString:
                NewValue = * static_cast<const WideString*>(Values[index+1].VWideString);
                break;
            case vtInt64:
                NewValue = * static_cast<const __int64*>(Values[index+1].VInt64);
                break;
            case vtUnicodeString:
                NewValue = * static_cast<const UnicodeString*>(Values[index+1].VUnicodeString);
                break;
            default:
                // Report the position of the offending element, which is the
                // replacement at index + 1, not the placeholder at index.
                throw Exception("illegal value type at index %d!", ARRAYOFCONST(( index + 1 )) );
        }
        Txt = StringReplace(Txt, OldValue, NewValue, TReplaceFlags() << rfReplaceAll);
    }
    return Txt;
}
MyFunction("Some %Txt1 and %Num2", OPENARRAY(TVarRec, ("%Txt1", Text1, "%Num2", Num2)) );
On a side note, when a function takes an open array of TVarRec values, you should use the ARRAYOFCONST() macro instead of the OPENARRAY() macro directly, eg:
String::Format("Some %s and %d", ARRAYOFCONST(( Text1, Num2 )) );
MyFunction("Some %Txt1 and %Num2", ARRAYOFCONST(( "%Txt1", Text1, "%Num2", Num2 )) );

Refactoring switch or if/else statement? [closed]

Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
This question does not appear to be about programming within the scope defined in the help center.
Closed 5 years ago.
Improve this question
i'm working on a school project and got some feedback from my teacher. He said that in my code there are some bad practices, he said that the switch cases could be replaced by a polymorphic approach. Only i have no clue how i could do this.
My code is receiving messages from a CAN bus. Those messages come from different devices, I check the messages from which device they come from. If there is a new device I create a object and parse the message and store the information.
This system is pretty much the same for each message.
Here is my code.
// Toggles LED1, latches the standard ID of the received CAN message into
// whisper_connect_id_ and forwards the message to the handler of the device
// family that owns that ID. IDs outside the listed families are ignored.
void Application::PollWhisperConnectBus()
{
    HAL_GPIO_TogglePin(PORT_LED1, PIN_LED1);
    whisper_connect_id_ = hcan2.pRxMsg->StdId;

    // Every family occupies one aligned block of 16 IDs (0xNN0..0xNNF), so
    // clearing the low four bits and comparing with the block base is the
    // same test as the inclusive range check.
    if ((whisper_connect_id_ & ~0xF) == 0x580)
        WIBDevice();

    if ((whisper_connect_id_ & ~0xF) == 0x590)
        BMSSDevice();

    if ((whisper_connect_id_ & ~0xF) == 0x5B0)
        DCPowerCubeDevice();

    if ((whisper_connect_id_ & ~0xF) == 0x5C0)
        ACPowerCubeDevice();

    if ((whisper_connect_id_ & ~0xF) == 0x700)
        WIBHeartBeatDevice();
}
This is one of the functions which checked if there is an object of the class, if so parse the message.
void Application::DCPowerCubeDevice()
{
bool found_device = false;
int device = (hcan2.pRxMsg->StdId & 0x0F) + device_instance_offset_;
WhisperConnectDevice* whisper_connect_device;
for(unsigned int i = 0; i < whisper_connect_device_list_.size(); ++i)
{
if ((whisper_connect_device = whisper_connect_device_list_.at(i)) != NULL &&
whisper_connect_device->GetClassName() == "DCPowerCube")
{
DCPowerCube* dc_powercube = dynamic_cast<DCPowerCube*>(whisper_connect_device);
if (dc_powercube != NULL)
{
if (dc_powercube->GetDevice() == device)
{
dc_powercube->ParseCanMessage(&hcan2);
found_device = true;
break;
}
}
}
}
if (!found_device)
{
WhisperConnectDevice* dc_powercube;
if ((dc_powercube = new DCPowerCube) != NULL)
{
dc_powercube->SetDevice(device);
int n2k_address = nmea2000_.FindFirstFreeCanId(n2k_address_, device_list_);
if (n2k_address != 0xFFFF)
{
dc_powercube->SetSrcCanId(n2k_address);
dc_powercube->SetDeviceInstanceOffset(device_instance_offset_);
dc_powercube->SetDeviceInstance(0x30 + device);
dc_powercube->AddressClaim(nmea2000_);
dc_powercube->SendPGN126996(nmea2000_);
dc_powercube->SendPGN126998(nmea2000_, "DCPowerCube", "", "");
device_list_.at(n2k_address) = 0x01;
}
DCPowerCube* dc_powercube2 = dynamic_cast<DCPowerCube*>(dc_powercube);
if (dc_powercube2 != NULL)
{
dc_powercube2->SetCurrentLimit(16);
}
AddToWPCDeviceList(dc_powercube);
}
}
}
// Decodes one received CAN frame into this object's fields.
// Data[1] (high byte) and Data[2] (low byte) form the message index that
// selects which fields are updated; the payload is taken from Data[4..7].
// Multi-byte payload values are big-endian (lower data index = higher byte).
// A NULL handle and unknown message indexes are ignored.
void DCPowerCube::ParseCanMessage(CAN_HandleTypeDef *can_handle)
{
    if (can_handle != NULL)
    {
        uint16_t message_index = (can_handle->pRxMsg->Data[1] << 8) + can_handle->pRxMsg->Data[2];
        switch (message_index)
        {
            case 0x1008: // Device name: four characters plus terminator
                device_name_[0] = can_handle->pRxMsg->Data[4];
                device_name_[1] = can_handle->pRxMsg->Data[5];
                device_name_[2] = can_handle->pRxMsg->Data[6];
                device_name_[3] = can_handle->pRxMsg->Data[7];
                device_name_[4] = '\0';
                break;
            case 0x100A: // Software version: four characters plus terminator
                software_version_[0] = can_handle->pRxMsg->Data[4];
                software_version_[1] = can_handle->pRxMsg->Data[5];
                software_version_[2] = can_handle->pRxMsg->Data[6];
                software_version_[3] = can_handle->pRxMsg->Data[7];
                software_version_[4] = '\0';
                break;
            case 0x1018: // Serial number, 32 bits
                // Widen each byte to uint32_t before shifting: shifting the
                // int-promoted byte left by 24 would overflow a signed int
                // whenever the top bit of Data[4] is set.
                serial_number_ = (static_cast<uint32_t>(can_handle->pRxMsg->Data[4]) << 24) |
                                 (static_cast<uint32_t>(can_handle->pRxMsg->Data[5]) << 16) |
                                 (static_cast<uint32_t>(can_handle->pRxMsg->Data[6]) << 8) |
                                 static_cast<uint32_t>(can_handle->pRxMsg->Data[7]);
                break;
            case 0x2100: // DC PowerCube status
                power_cube_status_ = can_handle->pRxMsg->Data[4];
                io_status_bit_ = can_handle->pRxMsg->Data[5];
                dip_switch_status_bit_ = can_handle->pRxMsg->Data[6];
                break;
            case 0x2111: // Grid voltage, current, current limit
                grid_voltage_ = (can_handle->pRxMsg->Data[4] << 8) + can_handle->pRxMsg->Data[5];
                grid_current_ = can_handle->pRxMsg->Data[6];
                grid_current_limit_ = can_handle->pRxMsg->Data[7];
                break;
            case 0x2112: // Generator frequency, RPM
                generator_freq_ = (can_handle->pRxMsg->Data[4] << 8) + can_handle->pRxMsg->Data[5];
                rpm_ = (can_handle->pRxMsg->Data[6] << 8) + can_handle->pRxMsg->Data[7];
                break;
            case 0x2113: // Generator current
                gen_current_phase1_ = can_handle->pRxMsg->Data[4];
                gen_current_phase2_ = can_handle->pRxMsg->Data[5];
                gen_current_phase3_ = can_handle->pRxMsg->Data[6];
                gen_current_limit_ = can_handle->pRxMsg->Data[7];
                break;
            case 0x2114: // Load percentage
                grid_load_ = can_handle->pRxMsg->Data[4];
                generator_load_ = can_handle->pRxMsg->Data[5];
                dc_output_load_ = can_handle->pRxMsg->Data[6];
                break;
            case 0x2151: // Battery type & charger state
                battery_type_ = can_handle->pRxMsg->Data[4];
                charger_state_ = can_handle->pRxMsg->Data[5];
                break;
            case 0x2152: // DC output voltage & DC slave voltage
                dc_output_voltage_ = (can_handle->pRxMsg->Data[4] << 8) + can_handle->pRxMsg->Data[5];
                dc_slave_voltage_ = (can_handle->pRxMsg->Data[6] << 8) + can_handle->pRxMsg->Data[7];
                break;
            case 0x2153: // DC output current & DC output current limit
                dc_output_current_ = (can_handle->pRxMsg->Data[4] << 8) + can_handle->pRxMsg->Data[5];
                dc_output_current_limit_ = (can_handle->pRxMsg->Data[6] << 8) + can_handle->pRxMsg->Data[7];
                break;
            case 0x21A0: // Temperature sensor
                temp_sens_BTS_ = can_handle->pRxMsg->Data[4];
                temp_sens_intern1_ = can_handle->pRxMsg->Data[5];
                temp_sens_intern2_ = can_handle->pRxMsg->Data[6];
                temp_sens_intern3_ = can_handle->pRxMsg->Data[7];
                break;
            case 0x21A1: // Recognised but carries nothing that is stored
                break;
        }
    }
}
The WhisperConnectDevice is the base class of DCPowerCube.
I would love to get some feedback on how to approach this problem.
Whether or not you introduce polymorphism it appears you have to map an externally provided type number (ID) to code so you will always need some structure inbetween.
Your candidates are:
A block of if statements probably if-else-if...
A switch statement (if values are amenable)
Some kind of look-up table (array, associative map, other...)
You've already got if but could improve with if-else-if.
That is normally considered the ugliest high-maintenance potential coding hot-spot approach. Coding hot-spot because all new IDs return to this code block.
I also notice in this case all your ranges are 0xnn0 to 0xnnF inclusive for some nn so you can at least simplify by reducing out the low 4 bits:
auto whisper_connect_type = whisper_connect_id_ >> 4;
Your switch option is then simplified to:
// Dispatch on the device type, i.e. the CAN id with its low four bits shifted out.
switch(whisper_connect_type) {
case 0x58: WIBDevice(); break;
case 0x59: BMSSDevice(); break;
case 0x5B: DCPowerCubeDevice(); break;
case 0x5C: ACPowerCubeDevice(); break;
case 0x70: WIBHeartBeatDevice(); break;
// Any id outside the known families is reported, using the full original id.
default: HandleUnknownDeviceIDError(whisper_connect_id_); break;
}
NB: I very strongly recommend some code to handle an unsupported ID. My advice is throwing an exception or something leading to termination. The break; is for completeness. I don't think you're coming back from an unknown ID.
An alternative is to define an associative map:
#include <iostream>
#include <unordered_map>
#include <memory>
// Abstract interface for a message handler. Concrete handlers implement
// HandleWhisper(); the virtual destructor allows deletion through a pointer
// to this base class.
class WhisperHandler {
public:
    virtual ~WhisperHandler() = default;

    // Handles one message; const because handlers are stored as pointers to const.
    virtual void HandleWhisper() const = 0;
};
// Example concrete handler: prints a fixed line to standard output.
class WhisperHandlerWIBDevice : public WhisperHandler {
public:
// Implements the WhisperHandler interface for this handler.
void HandleWhisper() const override {
std::cout << "Handler WIBDevice...\n";
}
} ;
int main() {
std::unordered_map<unsigned,std::unique_ptr<const WhisperHandler>> handlers;
//...
std::unique_ptr<const WhisperHandler> handler(std::make_unique<const WhisperHandlerWIBDevice>());
std::pair<const unsigned , std::unique_ptr<const WhisperHandler> > pair({0x5B,std::move(handler)});
handlers.insert(std::move(pair));
//...
{
const auto &chandlers=handlers;
auto handlerit(chandlers.find(0x5B1));
if(handlerit!=chandlers.end()){
handlerit->second->HandleWhisper();
}else{
//ERROR - UNKNOWN HANDLER.
}
}
return 0;
}
I would suggest however you're only going to get return on investment for all this polymorphic machinery if you're going to allow the registration of handlers dynamically either from different modules of the application or by dynamically loading libraries that register themselves on load.
If it's a single project application (which it appears to be) then the switch table dispatch will probably work fine.
Because applications tend to communicate using IDs of some kind, OO can start to look cumbersome when in practice it needs to take an ID, map it to a polymorphic handler and then call the handler. Logically you've done the ID-to-logic mapping twice!
Footnote: The trick of knocking out the lowest 4-bits is somewhat separate from these methods and (of course) slightly fragile if the lower 4 bits become relevant to determining the handler down the line.

How to find out the next time when the clock will be adjusted for Daylight Saving?

I'm curious, if there's any way to find out the UTC date/time when the next Daylight Saving adjustment will take place?
Something akin to what Windows reports (see circled):
This information is provided in Windows by the EnumDynamicTimeZoneInformation function.
See http://msdn.microsoft.com/en-us/library/windows/desktop/hh706893%28v=vs.85%29.aspx
There is a database that has code and data: http://www.iana.org/time-zones
I don't think there's a specific API for this. I would just do a binary search, using localtime (and maybe time and mktime) from <ctime> (C++) or <time.h> (C).
A basic approach is to scan ahead three months at a time until the tm_isdst flag in the returned data structure is flipped. Then you can start binary searching between the last two dates to figure out exactly when it flips.
See http://www.cplusplus.com/reference/ctime/tm/ for reference material.
I appreciate all your replies. And, yes, indeed I was asking about a WinAPI for Windows.
I did more research and came up with the following method that does what I wanted. It uses C++ and MFC's COleDateTime for easier date/time calculations. Other than that it's just C++ and WinAPIs. Please check if I understood the documentation for the DYNAMIC_TIME_ZONE_INFORMATION correctly. Here's the code:
int GetNextDaylightSavingAdjustmentTime(SYSTEMTIME* pOutDtNextDST_Local, int* pnOutAdjustmentMin)
{
    //Get next time when DST adjustment will take place
    //'pOutDtNextDST_Local' = if not NULL, receives the (local) time when next DST adjustment will take place
    //                        (all zeros unless the function returns 1)
    //'pnOutAdjustmentMin' = if not NULL, receives the amount of adjustment in minutes
    //RETURN:
    //      = 1 if got the time, or
    //      = 0 if DST is not used
    //      = -1 if error (check GetLastError() for info)
    int nOSError = NO_ERROR;

    //Load API dynamically (in case of Windows XP)
    //The API returns a DWORD time-zone id, so the pointer is declared with that return type
    DWORD (WINAPI *pfnGetDynamicTimeZoneInformation)(PDYNAMIC_TIME_ZONE_INFORMATION);
    (FARPROC&)pfnGetDynamicTimeZoneInformation =
        ::GetProcAddress(::GetModuleHandle(L"Kernel32.dll"), "GetDynamicTimeZoneInformation");

    DWORD tzID;
    SYSTEMTIME StandardDate;
    SYSTEMTIME DaylightDate;
    int nBiasDaylight;

    //Use newer API if possible
    if(pfnGetDynamicTimeZoneInformation)
    {
        DYNAMIC_TIME_ZONE_INFORMATION dtzi = {0};
        tzID = pfnGetDynamicTimeZoneInformation(&dtzi);
        StandardDate = dtzi.StandardDate;
        DaylightDate = dtzi.DaylightDate;
        nBiasDaylight = dtzi.DaylightBias;
    }
    else
    {
        //Older API
        TIME_ZONE_INFORMATION tzi = {0};
        tzID = GetTimeZoneInformation(&tzi);
        StandardDate = tzi.StandardDate;
        DaylightDate = tzi.DaylightDate;
        nBiasDaylight = tzi.DaylightBias;
    }

    int nRes = -1;
    int nAdjMins = 0;
    SYSTEMTIME stDstChange;
    memset(&stDstChange, 0, sizeof(stDstChange));
    SYSTEMTIME stDst;

    if(tzID == TIME_ZONE_ID_STANDARD ||
        tzID == TIME_ZONE_ID_DAYLIGHT)
    {
        //The next transition is the opposite of the current state:
        //currently standard -> next is the switch to daylight, and vice versa
        stDst = tzID != TIME_ZONE_ID_DAYLIGHT ? DaylightDate : StandardDate;

        //wDayOfWeek is unsigned, so only its upper bound needs checking
        if(stDst.wMonth >= 1 &&
            stDst.wMonth <= 12 &&
            stDst.wDay >= 1 &&
            stDst.wDayOfWeek <= 6)
        {
            //Get adjustment bias
            nAdjMins = tzID != TIME_ZONE_ID_DAYLIGHT ? -nBiasDaylight : nBiasDaylight;

            if(stDst.wYear == 0)
            {
                //Relative date ("day-in-month" format): wDayOfWeek is the weekday and
                //wDay is its occurrence within wMonth (1..4, or 5 meaning "the last one")
                SYSTEMTIME stLocal;
                ::GetLocalTime(&stLocal);

                const int nRequiredWeek = stDst.wDay <= 5 ? stDst.wDay : 5;
                const COleDateTime dtNow = COleDateTime::GetCurrentTime();

                //Compute the actual transition date for this year and, if it has
                //already passed, for next year. Comparing the full transition date
                //(not just the 1st of its month) keeps a transition that is still
                //ahead in the current month from being skipped.
                COleDateTime dt;
                bool bFound = false;
                for(int nYear = stLocal.wYear; nYear <= stLocal.wYear + 1; nYear++)
                {
                    //Begin from the 1st day of the month at the transition's time of day
                    if(dt.SetDateTime(nYear, stDst.wMonth, 1, stDst.wHour, stDst.wMinute, stDst.wSecond) != 0)
                        break;

                    //GetDayOfWeek(): 1=Sunday..7=Saturday; wDayOfWeek: 0=Sunday..6=Saturday
                    int nDaysToFirst = ((int)stDst.wDayOfWeek - (dt.GetDayOfWeek() - 1) + 7) % 7;

                    //Jump to the requested occurrence of that weekday
                    dt += COleDateTimeSpan(nDaysToFirst + 7 * (nRequiredWeek - 1), 0, 0, 0);

                    //"5" means the last occurrence: if the month has only four,
                    //the jump landed in the following month, so step back one week
                    if(dt.GetMonth() != stDst.wMonth)
                        dt -= COleDateTimeSpan(7, 0, 0, 0);

                    //Make sure that the date is in the future
                    if(dt > dtNow)
                    {
                        bFound = true;
                        break;
                    }
                }

                //Convert back to system time
                if(bFound && dt.GetAsSystemTime(stDstChange))
                {
                    //Success
                    nRes = 1;
                }
                else
                {
                    //Failed
                    nOSError = ERROR_INVALID_FUNCTION;
                    ASSERT(NULL);
                }
            }
            else
            {
                //Absolute date
                stDstChange = stDst;
                nRes = 1;
            }
        }
        else
        {
            //Failed
            nOSError = ERROR_INVALID_PARAMETER;
            ASSERT(NULL);
        }
    }
    else
    {
        //DST is not used
        if(tzID == TIME_ZONE_ID_UNKNOWN)
        {
            nRes = 0;
        }
        else
        {
            //Error
            nOSError = ERROR_INVALID_DATA;
            ASSERT(NULL);
        }
    }

    if(pOutDtNextDST_Local)
        *pOutDtNextDST_Local = stDstChange;
    if(pnOutAdjustmentMin)
        *pnOutAdjustmentMin = nAdjMins;

    ::SetLastError(nOSError);
    return nRes;
}
PS. And scratch my request for the UTC time. As I learned, it is easier to deal with local time in this situation.

How does Stack Overflow generate its SEO-friendly URLs?

What is a good complete regular expression or some other process that would take the title:
How do you change a title to be part of the URL like Stack Overflow?
and turn it into
how-do-you-change-a-title-to-be-part-of-the-url-like-stack-overflow
that is used in the SEO-friendly URLs on Stack Overflow?
The development environment I am using is Ruby on Rails, but if there are some other platform-specific solutions (.NET, PHP, Django), I would love to see those too.
I am sure I (or another reader) will come across the same problem on a different platform down the line.
I am using custom routes, and I mainly want to know how to alter the string so that all special characters are removed, it's all lowercase, and all whitespace is replaced.
Here's how we do it. Note that there are probably more edge conditions than you realize at first glance.
This is the second version, unrolled for 5x more performance (and yes, I benchmarked it). I figured I'd optimize it because this function can be called hundreds of times per page.
/// <summary>
/// Builds a URL-friendly version of a title, "like-this-one".
/// ASCII letters are lower-cased, digits are kept, a run of separator
/// characters becomes a single dash, characters at or above code 128 are
/// passed through RemapInternationalCharToAscii, and everything else is
/// dropped. Scanning stops after the character at index 80, and a trailing
/// dash is removed. Returns "" for a null title.
/// </summary>
public static string URLFriendly(string title)
{
    if (title == null) return "";

    const int maxlen = 80;
    var slug = new StringBuilder(title.Length);
    bool lastWasDash = false;

    for (int pos = 0; pos < title.Length; pos++)
    {
        char ch = title[pos];
        bool isLowerOrDigit = (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9');

        if (isLowerOrDigit)
        {
            slug.Append(ch);
            lastWasDash = false;
        }
        else if (ch >= 'A' && ch <= 'Z')
        {
            // setting bit 5 turns an ASCII upper-case letter into lower case
            slug.Append((char)(ch | 32));
            lastWasDash = false;
        }
        else if ((int)ch >= 128)
        {
            int lengthBefore = slug.Length;
            slug.Append(RemapInternationalCharToAscii(ch));
            // only a remap that actually produced output ends a dash run
            if (slug.Length != lengthBefore) lastWasDash = false;
        }
        else
        {
            switch (ch)
            {
                case ' ':
                case ',':
                case '.':
                case '/':
                case '\\':
                case '-':
                case '_':
                case '=':
                    // collapse separator runs, and never start with a dash
                    if (!lastWasDash && slug.Length > 0)
                    {
                        slug.Append('-');
                        lastWasDash = true;
                    }
                    break;
            }
        }

        if (pos == maxlen) break;
    }

    // lastWasDash is only true when the final appended character is a dash
    return lastWasDash
        ? slug.ToString(0, slug.Length - 1)
        : slug.ToString();
}
To see the previous version of the code this replaced (but is functionally equivalent to, and 5x faster), view revision history of this post (click the date link).
Also, the RemapInternationalCharToAscii method source code can be found here.
Here is my version of Jeff's code. I've made the following changes:
The hyphens were appended in such a way that one could be added, and then need removing as it was the last character in the string. That is, we never want “my-slug-”. This means an extra string allocation to remove it on this edge case. I’ve worked around this by delay-hyphening. If you compare my code to Jeff’s the logic for this is easy to follow.
His approach is purely lookup based and missed a lot of characters I found in examples while researching on Stack Overflow. To counter this, I first perform a normalisation pass (AKA collation mentioned in Meta Stack Overflow question Non US-ASCII characters dropped from full (profile) URL), and then ignore any characters outside the acceptable ranges. This works most of the time...
... For when it doesn’t I’ve also had to add a lookup table. As mentioned above, some characters don’t map to a low ASCII value when normalised. Rather than drop these I’ve got a manual list of exceptions that is doubtless full of holes, but it is better than nothing. The normalisation code was inspired by Jon Hanna’s great post in Stack Overflow question How can I remove accents on a string?.
The case conversion is now also optional.
public static class Slug
{
    /// <summary>
    /// Creates a slug from several values by joining them with hyphens and
    /// passing the result to <see cref="Create(bool, string)"/>.
    /// </summary>
    /// <param name="toLower">When true, upper-case ASCII letters are lower-cased.</param>
    /// <param name="values">The parts to join into one slug.</param>
    /// <returns>The slug for the joined values.</returns>
    public static string Create(bool toLower, params string[] values)
    {
        return Create(toLower, String.Join("-", values));
    }

    /// <summary>
    /// Creates a slug.
    /// References:
    /// http://www.unicode.org/reports/tr15/tr15-34.html
    /// https://meta.stackexchange.com/questions/7435/non-us-ascii-characters-dropped-from-full-profile-url/7696#7696
    /// https://stackoverflow.com/questions/25259/how-do-you-include-a-webpage-title-as-part-of-a-webpage-url/25486#25486
    /// https://stackoverflow.com/questions/3769457/how-can-i-remove-accents-on-a-string
    /// </summary>
    /// <param name="toLower">When true, upper-case ASCII letters are lower-cased.</param>
    /// <param name="value">The text to convert; null yields an empty string.</param>
    /// <returns>
    /// At most 80 characters made of ASCII letters, digits and single hyphens,
    /// never starting or ending with a hyphen.
    /// </returns>
    public static string Create(bool toLower, string value)
    {
        if (value == null)
            return "";

        // Compatibility decomposition splits accented letters into a base
        // letter plus combining marks; the marks are then dropped below.
        var normalised = value.Normalize(NormalizationForm.FormKD);
        const int maxlen = 80;
        int len = normalised.Length;
        bool prevDash = false; // a separator was seen; the hyphen is written lazily
        var sb = new StringBuilder(len);

        for (int i = 0; i < len && sb.Length < maxlen; i++)
        {
            char c = normalised[i];
            string swap = null; // multi-character replacement, when one applies

            if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'))
            {
                // Kept unchanged.
            }
            else if (c >= 'A' && c <= 'Z')
            {
                // Setting bit 5 turns an upper-case ASCII letter into lower case.
                if (toLower)
                    c = (char)(c | 32);
            }
            else if (c == ' ' || c == ',' || c == '.' || c == '/' || c == '\\' || c == '-' || c == '_' || c == '=')
            {
                // Remember the separator only; a hyphen is emitted when (and if)
                // another kept character follows, so the slug never ends in '-'.
                if (sb.Length > 0)
                    prevDash = true;
                continue;
            }
            else
            {
                swap = ConvertEdgeCases(c, toLower);
                if (swap == null)
                    continue;
            }

            // Stop before writing anything that would push the slug past maxlen.
            // Checking for equality afterwards is not enough: a pending hyphen
            // plus a character, or a two-letter swap, can step over the limit.
            int needed = (prevDash ? 1 : 0) + (swap == null ? 1 : swap.Length);
            if (sb.Length + needed > maxlen)
                break;

            if (prevDash)
            {
                sb.Append('-');
                prevDash = false;
            }
            if (swap == null)
                sb.Append(c);
            else
                sb.Append(swap);
        }
        return sb.ToString();
    }

    /// <summary>
    /// Maps characters that normalisation does not reduce to ASCII.
    /// </summary>
    /// <param name="c">The character to look up.</param>
    /// <param name="toLower">Chooses the case of the replacement where both exist.</param>
    /// <returns>The ASCII replacement, or null when the character is not listed.</returns>
    static string ConvertEdgeCases(char c, bool toLower)
    {
        string swap = null;
        switch (c)
        {
            case 'ı':
                swap = "i";
                break;
            case 'ł':
                swap = "l";
                break;
            case 'Ł':
                swap = toLower ? "l" : "L";
                break;
            case 'đ':
                swap = "d";
                break;
            case 'ß':
                swap = "ss";
                break;
            case 'ø':
                swap = "o";
                break;
            case 'Þ':
                swap = "th";
                break;
        }
        return swap;
    }
}
For more details, the unit tests, and an explanation of why Facebook's URL scheme is a little smarter than Stack Overflows, I've got an expanded version of this on my blog.
You will want to setup a custom route to point the URL to the controller that will handle it. Since you are using Ruby on Rails, here is an introduction in using their routing engine.
In Ruby, you will need a regular expression like you already know and here is the regular expression to use:
# Builds a permalink fragment from +str+: characters that are neither word
# characters nor slashes become spaces, the result is trimmed and lower-cased,
# and each remaining run of spaces becomes a single hyphen.
def permalink_for(str)
  spaced = str.gsub(/[^\w\/]|[!\(\)\.]+/, ' ')
  tidy = spaced.strip.downcase
  tidy.gsub(/\ +/, '-')
end
You can also use this JavaScript function for in-form generation of the slug's (this one is based on/copied from Django):
/**
 * Turns a title into a URL slug, e.g. "Petty theft" -> "petty-theft".
 *
 * @param {string} urlString The text to convert.
 * @param {boolean} filter When truthy, common stop words are removed first.
 * @returns {string} Lower-case slug with words separated by single hyphens.
 */
function makeSlug(urlString, filter) {
    // Declared locally: the previous implicit globals leaked into the global
    // scope and raise a ReferenceError in strict mode / ES modules.
    var removelist = filter
        ? ["a", "an", "as", "at", "before", "but", "by", "for", "from",
           "is", "in", "into", "like", "of", "off", "on", "onto", "per",
           "since", "than", "the", "this", "that", "to", "up", "via", "het", "de", "een", "en",
           "with"]
        : [];
    var s = urlString;
    if (removelist.length > 0) {
        // Remove every listed word, matched as a whole word, case-insensitively.
        var r = new RegExp('\\b(' + removelist.join('|') + ')\\b', 'gi');
        s = s.replace(r, '');
    }
    s = s.replace(/[^-\w\s]/g, '');   // Remove unneeded characters
    s = s.replace(/^\s+|\s+$/g, '');  // Trim leading/trailing spaces
    s = s.replace(/[-\s]+/g, '-');    // Collapse runs of spaces/hyphens into one hyphen
    s = s.toLowerCase();              // Convert to lowercase
    return s;
}
For good measure, here's the PHP function in WordPress that does it... I'd think that WordPress is one of the more popular platforms that uses fancy links.
/**
 * Reduces a title to a lower-case, hyphen-separated slug.
 *
 * The passes below are order-dependent: percent-encoded octets are shielded
 * before stray '%' signs are removed, and filtering to the allowed character
 * set happens before whitespace is turned into hyphens.
 *
 * @param string $title The raw title.
 * @return string The sanitized title.
 */
function sanitize_title_with_dashes($title) {
// Drop HTML/PHP tags.
$title = strip_tags($title);
// Preserve escaped octets.
$title = preg_replace('|%([a-fA-F0-9][a-fA-F0-9])|', '---$1---', $title);
// Remove percent signs that are not part of an octet.
$title = str_replace('%', '', $title);
// Restore octets.
$title = preg_replace('|---([a-fA-F0-9][a-fA-F0-9])---|', '%$1', $title);
// NOTE(review): remove_accents(), seems_utf8() and utf8_uri_encode() are defined
// outside this snippet; presumably they transliterate accents, detect UTF-8 and
// percent-encode multi-byte characters (up to 200) -- confirm against their source.
$title = remove_accents($title);
if (seems_utf8($title)) {
if (function_exists('mb_strtolower')) {
$title = mb_strtolower($title, 'UTF-8');
}
$title = utf8_uri_encode($title, 200);
}
$title = strtolower($title);
$title = preg_replace('/&.+?;/', '', $title); // kill entities
// Keep only '%', a-z, 0-9, space, underscore and hyphen.
$title = preg_replace('/[^%a-z0-9 _-]/', '', $title);
// Each run of whitespace becomes one hyphen.
$title = preg_replace('/\s+/', '-', $title);
// Collapse repeated hyphens, then strip them from both ends.
$title = preg_replace('|-+|', '-', $title);
$title = trim($title, '-');
return $title;
}
This function as well as some of the supporting functions can be found in wp-includes/formatting.php.
If you are using Rails edge, you can rely on Inflector.parameterize - here's the example from the documentation:
class Person
  # URL parameter for this record: the id, a hyphen, then the
  # parameterized name.
  def to_param
    format('%s-%s', id, name.parameterize)
  end
end
#person = Person.find(1)
# => #<Person id: 1, name: "Donald E. Knuth">
<%= link_to(#person.name, person_path(#person)) %>
# => Donald E. Knuth
Also if you need to handle more exotic characters such as accents (éphémère) in previous version of Rails, you can use a mixture of PermalinkFu and DiacriticsFu:
# Console transcript: each "=>" line is the value returned by the call above
# it; in both samples the accented letters come back as plain ASCII letters.
DiacriticsFu::escape("éphémère")
=> "ephemere"
DiacriticsFu::escape("räksmörgås")
=> "raksmorgas"
I am not familiar with Ruby on Rails, but the following is (untested) PHP code. You can probably translate this very quickly to Ruby on Rails if you find it useful.
// Example: derive a hyphen-separated URL slug from a free-text title.
$sURL = "This is a title to convert to URL-format. It has 1 number in it!";
// Lower-case the title, then turn every run of non-word characters into one space.
$sURL = preg_replace("/\W+/", " ", strtolower($sURL));
// Trim the edge spaces (so no separator ends up at either end) and
// replace the remaining spaces with hyphens.
$sURL = str_replace(" ", "-", trim($sURL));
echo $sURL;
// prints: this-is-a-title-to-convert-to-url-format-it-has-1-number-in-it
I hope this helps.
I don't know much about Ruby or Rails, but in Perl, this is what I would do:
# Example: build a URL slug from a question title.
my $title = "How do you change a title to be part of the url like Stackoverflow?";
my $url = lc $title;    # Work on a lower-cased copy.
for ($url) {            # Alias $url to $_ so each substitution edits it in place.
    s/^\s+//g;          # Strip leading whitespace.
    s/\s+$//g;          # Strip trailing whitespace.
    s/\s+/\-/g;         # Each run of whitespace becomes one hyphen.
    s/[^\w\-]//g;       # Drop everything except word characters and hyphens.
}
print "$title\n$url\n";
I just did a quick test and it seems to work. Hopefully this is relatively easy to translate to Ruby.
T-SQL implementation, adapted from dbo.UrlEncode:
-- Converts @string into a URL slug: the input is trimmed and lower-cased,
-- spaces become hyphens, and only characters matching [a-z0-9-] are kept.
-- Variables use the T-SQL '@' prefix ('#' denotes temporary objects and is
-- not valid for parameters or local variables).
CREATE FUNCTION dbo.Slug(@string varchar(1024))
RETURNS varchar(3072)
AS
BEGIN
    DECLARE @count int, @c char(1), @i int, @slug varchar(3072)
    SET @string = replace(lower(ltrim(rtrim(@string))),' ','-')
    SET @count = Len(@string)
    SET @i = 1
    SET @slug = ''
    -- Walk the string one character at a time; SUBSTRING positions are 1-based.
    WHILE (@i <= @count)
    BEGIN
        SET @c = substring(@string, @i, 1)
        -- The trailing hyphens in the class are there to admit a literal '-'.
        IF @c LIKE '[a-z0-9--]'
            SET @slug = @slug + @c
        SET @i = @i +1
    END
    RETURN @slug
END
I know it's a very old question, but since most browsers now support Unicode URLs, I found a great solution using XRegExp that converts everything except letters (in all languages) to '-'.
That can be done in several programming languages.
The pattern is \\p{^L}+ and then you just need to use it to replace all non letters to '-'.
Working example in Node.js with the xregexp module.
// Example: replace every run of characters that are neither letters (in any
// script) nor digits with a single hyphen, then lower-case the result.
// NOTE(review): assumes XRegExp is already in scope, e.g. loaded from the
// xregexp package -- confirm how it is imported in your project.
var text = 'This ! can # have # several $ letters % from different languages such as עברית or Español';
var slugRegEx = XRegExp('((?!\\d)\\p{^L})+', 'g');
var slug = XRegExp.replace(text, slugRegEx, '-').toLowerCase();
console.log(slug); // ==> "this-can-have-several-letters-from-different-languages-such-as-עברית-or-español"
Assuming that your model class has a title attribute, you can simply override the to_param method within the model, like this:
# URL parameter for the record: the lower-cased title with every space
# replaced by a hyphen.
def to_param
  lowered = title.downcase
  lowered.gsub(/ /, '-')
end
This Railscast episode has all the details. You can also ensure that the title only contains valid characters using this:
# Restrict titles to lower-case letters, digits and hyphens.
# \A and \z anchor to the whole string; in Ruby ^ and $ only anchor to a
# line, so a multi-line value with one valid line would pass validation.
validates_format_of :title, :with => /\A[a-z0-9-]+\z/,
  :message => 'can only contain letters, numbers and hyphens'
Brian's code, in Ruby:
# Slug expression: lower-case, trim, hyphenate spaces, then drop every
# character that is neither a word character nor a hyphen.
title
  .downcase
  .strip
  .gsub(/\ /, '-')
  .gsub(/[^\w\-]/, '')
downcase turns the string to lowercase, strip removes leading and trailing whitespace, the first gsub call globally substitutes spaces with dashes, and the second removes everything that isn't a word character (a letter, digit or underscore) or a dash.
There is a small Ruby on Rails plugin called PermalinkFu, that does this. The escape method does the transformation into a string that is suitable for a URL. Have a look at the code; that method is quite simple.
To remove non-ASCII characters it uses the iconv lib to translate to 'ascii//ignore//translit' from 'utf-8'. Spaces are then turned into dashes, everything is downcased, etc.
You can use the following helper method. It can convert the Unicode characters.
/// <summary>
/// Converts text to a slug: letters and digits (of any script) are kept in
/// lower case, each run of whitespace between them becomes one hyphen, and
/// every other character is dropped.
/// </summary>
/// <param name="s">The text to convert; null or empty yields an empty string.</param>
/// <returns>The slug, without leading or trailing hyphens.</returns>
public static string ConvertTextToSlug(string s)
{
    if (string.IsNullOrEmpty(s))
        return string.Empty;

    var sb = new StringBuilder(s.Length);
    // Starts true so that leading whitespace does not produce a hyphen.
    bool wasHyphen = true;
    foreach (char c in s)
    {
        if (char.IsLetterOrDigit(c))
        {
            // Invariant casing keeps slugs identical on every server culture
            // (culture-sensitive ToLower maps 'I' to dotless 'ı' under tr-TR).
            sb.Append(char.ToLowerInvariant(c));
            wasHyphen = false;
        }
        else if (char.IsWhiteSpace(c) && !wasHyphen)
        {
            sb.Append('-');
            wasHyphen = true;
        }
    }
    // Avoid trailing hyphens
    if (wasHyphen && sb.Length > 0)
        sb.Length--;
    // A hyphen is only written after a letter or digit, so "--" cannot occur
    // and no further clean-up pass is needed.
    return sb.ToString();
}
Here's my (slower, but fun to write) version of Jeff's code:
/// <summary>
/// Builds a URL-friendly form of a title: characters are lower-cased and
/// stripped of diacritics, letters and digits are kept, and every run of other
/// characters becomes a single hyphen. "#" after "c" or "f" additionally
/// writes "sharp" and "+" writes "plus". The result is limited to 80
/// characters with trailing hyphens removed.
/// </summary>
/// <param name="title">The title to convert; null or empty yields an empty string.</param>
/// <returns>The URL-friendly string.</returns>
public static string URLFriendly(string title)
{
    if (string.IsNullOrEmpty(title))
        return string.Empty;

    // Mutable state shared by the lazily evaluated query below; the "let _ ="
    // clauses update it as each character flows through.
    char? prevRead = null,
        prevWritten = null;
    var seq =
        from c in title
        let remapped = RemapInternationalCharToAscii(char.ToLowerInvariant(c).ToString())
        // A lone combining mark (decomposed input such as "e\u0300") remaps
        // to an empty string; skip it instead of indexing [0] and throwing.
        where remapped.Length > 0
        let norm = remapped[0]
        let keep = char.IsLetterOrDigit(norm)
        // Nothing is emitted until the first letter or digit has been seen.
        where prevRead.HasValue || keep
        // Non-kept characters become '-', unless a '-' was just written.
        let replaced = keep ? norm
            : prevWritten != '-' ? '-'
            : (char?)null
        where replaced != null
        let s = replaced + (prevRead == null ? ""
            : norm == '#' && "cf".Contains(prevRead.Value) ? "sharp"
            : norm == '+' ? "plus"
            : "")
        let _ = prevRead = norm
        from written in s
        let __ = prevWritten = written
        select written;
    const int maxlen = 80;
    return string.Concat(seq.Take(maxlen)).TrimEnd('-');
}
/// <summary>
/// Removes diacritics from text: the input is decomposed (form D), every
/// non-spacing mark is discarded, and the remainder is recomposed (form C).
/// </summary>
/// <param name="text">The text to strip.</param>
/// <returns>The text without non-spacing marks.</returns>
public static string RemapInternationalCharToAscii(string text)
{
    var stripped = new StringBuilder();
    foreach (char ch in text.Normalize(NormalizationForm.FormD))
    {
        if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark)
            continue;
        stripped.Append(ch);
    }
    return stripped.ToString().Normalize(NormalizationForm.FormC);
}
My test string:
" I love C#, F#, C++, and... Crème brûlée!!! They see me codin'... they hatin'... tryin' to catch me codin' dirty... "
The stackoverflow solution is great, but modern browser (excluding IE, as usual) now handle nicely utf8 encoding:
So I upgraded the proposed solution:
public static string ToFriendlyUrl(string title, bool useUTF8Encoding = false)
{
...
else if (c >= 128)
{
int prevlen = sb.Length;
if (useUTF8Encoding )
{
sb.Append(HttpUtility.UrlEncode(c.ToString(CultureInfo.InvariantCulture),Encoding.UTF8));
}
else
{
sb.Append(RemapInternationalCharToAscii(c));
}
...
}
Full Code on Pastebin
Edit: Here's the code for RemapInternationalCharToAscii method (that's missing in the pastebin).
I liked the way this is done without using regular expressions, so I ported it to PHP. I just added a function called is_between to check characters:
/**
 * Tells whether a value lies within an inclusive integer range.
 * All three arguments are cast to int before comparing.
 *
 * @param mixed $val Value to test.
 * @param mixed $min Lower bound (inclusive).
 * @param mixed $max Upper bound (inclusive).
 * @return bool True when $min <= $val <= $max.
 */
function is_between($val, $min, $max)
{
    $value = (int) $val;
    $lower = (int) $min;
    $upper = (int) $max;
    if ($value < $lower)
    {
        return false;
    }
    return $value <= $upper;
}
/**
 * Maps one accented or special Latin character to its ASCII replacement.
 *
 * The lookup strings carry the full accented sets; with the accents missing
 * (e.g. 'gg' or 'r' as the search string) those characters could never match
 * and were silently dropped.
 *
 * @param string $char A single UTF-8 character; it is lower-cased before lookup.
 * @return string The ASCII replacement, or '' when the character is not mapped.
 */
function international_char_to_ascii($char)
{
    // ASCII replacement => characters that map to it.
    static $map = array(
        'a'  => 'àåáâäãåą',
        'e'  => 'èéêëę',
        'i'  => 'ìíîïı',
        'o'  => 'òóôõöøőð',
        'u'  => 'ùúûüŭů',
        'c'  => 'çćčĉ',
        'z'  => 'żźž',
        's'  => 'śşšŝ',
        'n'  => 'ñń',
        'y'  => 'ýÿ',
        'g'  => 'ğĝ',
        'r'  => 'ř',
        'l'  => 'ł',
        'd'  => 'đ',
        'ss' => 'ß',
        'th' => 'þ',
        'h'  => 'ĥ',
        'j'  => 'ĵ',
    );

    // An empty needle matches at offset 0 on PHP 8 (and warns on older
    // versions), which would wrongly return 'a'.
    if ($char === null || $char === '')
    {
        return '';
    }

    // Lower-case here so upper-case input (e.g. 'Þ', 'Ř') maps as well.
    $char = mb_strtolower($char, 'UTF-8');

    foreach ($map as $ascii => $accented)
    {
        if (mb_strpos($accented, $char, 0, 'UTF-8') !== false)
        {
            return $ascii;
        }
    }
    return '';
}
/**
 * Converts a title into a URL-friendly slug.
 *
 * The title is lower-cased; ASCII letters and digits are kept; space, comma,
 * period, slash, backslash, hyphen, underscore and equals sign collapse into
 * a single hyphen; other non-ASCII characters go through
 * international_char_to_ascii(); everything else is dropped. Scanning stops
 * once the input index reaches 80, and a trailing hyphen is removed.
 *
 * @param string $url_title The title to convert.
 * @return string The slug, or '' for an empty title.
 */
function url_friendly_title($url_title)
{
    // Explicit emptiness check: empty('0') is true and would reject the title "0".
    if ($url_title === null || (string) $url_title === '')
    {
        return '';
    }

    $url_title = mb_strtolower($url_title);
    $url_title_max_length = 80;
    $url_title_length = mb_strlen($url_title);
    $url_title_friendly = '';
    $url_title_dash_added = false;
    // Codes of ' ', ',', '.', '/', '\', '-', '_' and '='.
    $url_title_separators = array(32, 44, 46, 47, 92, 45, 95, 61);

    for ($i = 0; $i < $url_title_length; $i++)
    {
        $url_title_char = mb_substr($url_title, $i, 1);
        // The first byte is enough: it equals the code for ASCII characters and
        // is >= 128 for every multi-byte character, which is all the branches
        // below distinguish.
        $url_title_ascii = ord($url_title_char);

        // No A-Z branch is needed: the title was lower-cased above.
        if (($url_title_ascii >= 97 && $url_title_ascii <= 122) || ($url_title_ascii >= 48 && $url_title_ascii <= 57))
        {
            $url_title_friendly .= $url_title_char;
            $url_title_dash_added = false;
        }
        elseif (in_array($url_title_ascii, $url_title_separators, true))
        {
            // Collapse separator runs and never start the slug with a hyphen.
            if (!$url_title_dash_added && mb_strlen($url_title_friendly) > 0)
            {
                $url_title_friendly .= chr(45);
                $url_title_dash_added = true;
            }
        }
        elseif ($url_title_ascii >= 128)
        {
            $url_title_previous_length = mb_strlen($url_title_friendly);
            $url_title_friendly .= international_char_to_ascii($url_title_char);
            // Only reset the flag when the character actually produced output.
            if ($url_title_previous_length != mb_strlen($url_title_friendly))
            {
                $url_title_dash_added = false;
            }
        }

        if ($i == $url_title_max_length)
        {
            break;
        }
    }

    if ($url_title_dash_added)
    {
        // The last character written was a hyphen; drop it.
        return mb_substr($url_title_friendly, 0, -1);
    }
    return $url_title_friendly;
}
All browsers now handle UTF-8 encoding nicely, so you can use the WebUtility.UrlEncode method; it is like the HttpUtility.UrlEncode used by @giamin, but it also works outside of a web application.
I ported the code to TypeScript. It can easily be adapted to JavaScript.
I am adding a .contains method to the String prototype, if you're targeting the latest browsers or ES6 you can use .includes instead.
// Type declaration for the String.prototype.contains helper installed below.
declare interface String {
    /** Returns true when `check` occurs anywhere in this string. */
    contains(check: string): boolean;
}

// Install the helper only when the runtime does not already provide one.
if (!String.prototype.contains) {
    String.prototype.contains = function (needle) {
        const position = this.indexOf(needle, 0);
        return position !== -1;
    };
}
/**
 * Converts a title into a URL-friendly slug.
 *
 * ASCII letters are lower-cased and kept together with digits; space, comma,
 * period, slash, backslash, hyphen, underscore and equals sign collapse into a
 * single hyphen; listed accented characters are replaced by ASCII letters;
 * everything else is dropped. Scanning stops once the input index reaches 80,
 * and a trailing hyphen is removed.
 *
 * @param title The text to convert; null or empty yields ''.
 * @returns The slug.
 */
export function MakeUrlFriendly(title: string) {
    if (title == null || title == '')
        return '';

    const maxlen = 80;
    // [characters, ASCII replacement]; every entry is in lower case.
    const asciiRemap: string[][] = [
        ["àåáâäãåą", "a"],
        ["èéêëę", "e"],
        ["ìíîïı", "i"],
        ["òóôõöøőð", "o"],
        ["ùúûüŭů", "u"],
        ["çćčĉ", "c"],
        ["żźž", "z"],
        ["śşšŝ", "s"],
        ["ñń", "n"],
        ["ýÿ", "y"],
        ["ğĝ", "g"],
        ["ř", "r"],
        ["ł", "l"],
        ["đ", "d"],
        ["ß", "ss"],
        ["þ", "th"],
        ["ĥ", "h"],
        ["ĵ", "j"]
    ];

    // Every comparison uses the lower-cased character, so upper-case forms
    // such as 'Ř', 'Ł' or 'Þ' map the same way as their lower-case forms.
    const remapInternationalCharToAscii = function (ch: string): string {
        const s = ch.toLowerCase();
        for (let k = 0; k < asciiRemap.length; k++) {
            if (asciiRemap[k][0].indexOf(s) !== -1) {
                return asciiRemap[k][1];
            }
        }
        return "";
    };

    const len = title.length;
    let prevdash = false;
    let result = '';

    for (let i = 0; i < len; i++) {
        const c = title[i];
        const cc = c.charCodeAt(0);
        if ((cc >= 97 /* a */ && cc <= 122 /* z */) || (cc >= 48 /* 0 */ && cc <= 57 /* 9 */)) {
            result += c;
            prevdash = false;
        }
        else if (cc >= 65 /* A */ && cc <= 90 /* Z */) {
            result += c.toLowerCase();
            prevdash = false;
        }
        else if (c == ' ' || c == ',' || c == '.' || c == '/' || c == '\\' || c == '-' || c == '_' || c == '=') {
            // Collapse separator runs and never start the slug with a hyphen.
            if (!prevdash && result.length > 0) {
                result += '-';
                prevdash = true;
            }
        }
        else if (cc >= 128) {
            const prevlen = result.length;
            result += remapInternationalCharToAscii(c);
            // Only reset the flag when the character actually produced output.
            if (prevlen != result.length) prevdash = false;
        }
        if (i == maxlen) break;
    }

    // The last character written was a hyphen; drop it.
    if (prevdash)
        return result.substring(0, result.length - 1);
    return result;
}
No, no, no. You are all so very wrong. Except for the diacritics-fu stuff, you're getting there, but what about Asian characters (shame on Ruby developers for not considering their nihonjin brethren).
Firefox and Safari both display non-ASCII characters in the URL, and frankly they look great. It is nice to support links like 'http://somewhere.com/news/read/お前たちはアホじゃないかい'.
So here's some PHP code that'll do it, but I just wrote it and haven't stress tested it.
<?php
/**
 * Builds a slug from one or more strings while keeping non-ASCII characters.
 *
 * All arguments are joined with '-' and lower-cased. Multi-byte characters and
 * ASCII letters/digits are kept; '&' becomes '-and-'; every run of other
 * characters becomes a single '-'. Separators are only written between kept
 * characters, so the slug never starts or ends with one.
 *
 * @param string $str First part; any further arguments are appended as parts.
 * @return string The slug.
 */
function slug($str)
{
    $args = func_get_args();
    // Remove blank parts. array_filter() returns the filtered array, so its
    // result must be assigned; the explicit callback keeps a part such as "0".
    $args = array_filter($args, function ($part) {
        return (string) $part !== '';
    });
    $slug = mb_strtolower(implode('-', $args));
    $real_slug = '';
    $hyphen = ''; // separator waiting to be written before the next kept character
    // NOTE(review): SU::mb_str_split() is defined outside this snippet; it is
    // assumed to split the string into single characters -- confirm.
    foreach (SU::mb_str_split($slug) as $c)
    {
        $is_multibyte = strlen($c) > 1 && mb_strlen($c) === 1;
        if ($is_multibyte || preg_match('/\A[A-Za-z0-9]\z/', $c) === 1)
        {
            $real_slug .= $hyphen . $c;
            $hyphen = '';
        }
        elseif ($c === '&')
        {
            // Compared with '' explicitly: the slug "0" is falsy in PHP.
            $hyphen = $real_slug !== '' ? '-and-' : '';
        }
        elseif ($hyphen === '')
        {
            // Any other character: queue a plain hyphen unless a separator
            // (possibly '-and-') is already pending.
            $hyphen = $real_slug !== '' ? '-' : '';
        }
    }
    return $real_slug;
}
Example:
// Example input: three katakana words, each surrounded by runs of ASCII
// punctuation, whitespace and control characters; the slug is then printed.
$str = "~!##$%^&*()_+-=[]\{}|;':\",./<>?\n\r\t\x07\x00\x04 コリン ~!##$%^&*()_+-=[]\{}|;':\",./<>?\n\r\t\x07\x00\x04 トーマス ~!##$%^&*()_+-=[]\{}|;':\",./<>?\n\r\t\x07\x00\x04 アーノルド ~!##$%^&*()_+-=[]\{}|;':\",./<>?\n\r\t\x07\x00\x04";
echo slug($str);
Outputs:
コリン-and-トーマス-and-アーノルド
The '-and-' is because &'s get changed to '-and-'.
Rewrite of Jeff's code to be more concise
// ASCII replacement paired with the lower-case characters that map to it.
// Built once rather than on every call.
private static readonly string[][] AsciiRemappings =
{
    new[] { "a", "àåáâäãåą" },
    new[] { "c", "çćčĉ" },
    new[] { "d", "đ" },
    new[] { "e", "èéêëę" },
    new[] { "g", "ğĝ" },
    new[] { "h", "ĥ" },
    new[] { "i", "ìíîïı" },
    new[] { "j", "ĵ" },
    new[] { "l", "ł" },
    new[] { "n", "ñń" },
    new[] { "o", "òóôõöøőð" },
    new[] { "r", "ř" },
    new[] { "s", "śşšŝ" },
    new[] { "ss", "ß" },
    // Lower-case thorn: the input is lower-cased before the lookup, so an
    // upper-case 'Þ' entry could never match.
    new[] { "th", "þ" },
    new[] { "u", "ùúûüŭů" },
    new[] { "y", "ýÿ" },
    new[] { "z", "żźž" }
};

/// <summary>
/// Maps an accented or special Latin character to its ASCII replacement.
/// The lookup is case-insensitive (invariant culture).
/// </summary>
/// <param name="c">The character to map.</param>
/// <returns>The ASCII replacement, or an empty string when the character is not mapped.</returns>
public static string RemapInternationalCharToAscii(char c)
{
    char lower = char.ToLowerInvariant(c);
    foreach (var mapping in AsciiRemappings)
    {
        if (mapping[1].IndexOf(lower) >= 0)
            return mapping[0];
    }
    return string.Empty;
}