我试图转换一些字符串,在法国加拿大,基本上,我想能够拿出法国重音标记在字母,同时保持字母。(例如,将é转换为e,那么crème brûlée就会变成creme brulee)
实现这一目标的最佳方法是什么?
我试图转换一些字符串,在法国加拿大,基本上,我想能够拿出法国重音标记在字母,同时保持字母。(例如,将é转换为e,那么crème brûlée就会变成creme brulee)
实现这一目标的最佳方法是什么?
当前回答
在这里弹出这个库,如果您还没有考虑过的话。看起来有一个完整的单元测试。
https://github.com/thomasgalliker/Diacritics.NET
其他回答
这招对我很管用……
string accentedStr;
byte[] tempBytes;
tempBytes = System.Text.Encoding.GetEncoding("ISO-8859-8").GetBytes(accentedStr);
string asciiStr = System.Text.Encoding.UTF8.GetString(tempBytes);
快速短!
你可以从MMLib中使用字符串扩展。扩展nuget包:
using MMLib.RapidPrototyping.Generators;
public void ExtensionsExample()
{
string target = "aácčeéií";
Assert.AreEqual("aacceeii", target.RemoveDiacritics());
}
Nuget页面:https://www.nuget.org/packages/MMLib.Extensions/ Codeplex项目网站https://mmlib.codeplex.com/
我需要一些东西,转换所有主要的unicode字符和投票的答案留下了一些,所以我已经创建了一个CodeIgniter的convert_accented_characters($str)的版本为c#,很容易自定义:
using System;
using System.Text;
using System.Collections.Generic;
public static class Strings
{
static Dictionary<string, string> foreign_characters = new Dictionary<string, string>
{
{ "äæǽ", "ae" },
{ "öœ", "oe" },
{ "ü", "ue" },
{ "Ä", "Ae" },
{ "Ü", "Ue" },
{ "Ö", "Oe" },
{ "ÀÁÂÃÄÅǺĀĂĄǍΑΆẢẠẦẪẨẬẰẮẴẲẶА", "A" },
{ "àáâãåǻāăąǎªαάảạầấẫẩậằắẵẳặа", "a" },
{ "Б", "B" },
{ "б", "b" },
{ "ÇĆĈĊČ", "C" },
{ "çćĉċč", "c" },
{ "Д", "D" },
{ "д", "d" },
{ "ÐĎĐΔ", "Dj" },
{ "ðďđδ", "dj" },
{ "ÈÉÊËĒĔĖĘĚΕΈẼẺẸỀẾỄỂỆЕЭ", "E" },
{ "èéêëēĕėęěέεẽẻẹềếễểệеэ", "e" },
{ "Ф", "F" },
{ "ф", "f" },
{ "ĜĞĠĢΓГҐ", "G" },
{ "ĝğġģγгґ", "g" },
{ "ĤĦ", "H" },
{ "ĥħ", "h" },
{ "ÌÍÎÏĨĪĬǏĮİΗΉΊΙΪỈỊИЫ", "I" },
{ "ìíîïĩīĭǐįıηήίιϊỉịиыї", "i" },
{ "Ĵ", "J" },
{ "ĵ", "j" },
{ "ĶΚК", "K" },
{ "ķκк", "k" },
{ "ĹĻĽĿŁΛЛ", "L" },
{ "ĺļľŀłλл", "l" },
{ "М", "M" },
{ "м", "m" },
{ "ÑŃŅŇΝН", "N" },
{ "ñńņňʼnνн", "n" },
{ "ÒÓÔÕŌŎǑŐƠØǾΟΌΩΏỎỌỒỐỖỔỘỜỚỠỞỢО", "O" },
{ "òóôõōŏǒőơøǿºοόωώỏọồốỗổộờớỡởợо", "o" },
{ "П", "P" },
{ "п", "p" },
{ "ŔŖŘΡР", "R" },
{ "ŕŗřρр", "r" },
{ "ŚŜŞȘŠΣС", "S" },
{ "śŝşșšſσςс", "s" },
{ "ȚŢŤŦτТ", "T" },
{ "țţťŧт", "t" },
{ "ÙÚÛŨŪŬŮŰŲƯǓǕǗǙǛŨỦỤỪỨỮỬỰУ", "U" },
{ "ùúûũūŭůűųưǔǖǘǚǜυύϋủụừứữửựу", "u" },
{ "ÝŸŶΥΎΫỲỸỶỴЙ", "Y" },
{ "ýÿŷỳỹỷỵй", "y" },
{ "В", "V" },
{ "в", "v" },
{ "Ŵ", "W" },
{ "ŵ", "w" },
{ "ŹŻŽΖЗ", "Z" },
{ "źżžζз", "z" },
{ "ÆǼ", "AE" },
{ "ß", "ss" },
{ "IJ", "IJ" },
{ "ij", "ij" },
{ "Œ", "OE" },
{ "ƒ", "f" },
{ "ξ", "ks" },
{ "π", "p" },
{ "β", "v" },
{ "μ", "m" },
{ "ψ", "ps" },
{ "Ё", "Yo" },
{ "ё", "yo" },
{ "Є", "Ye" },
{ "є", "ye" },
{ "Ї", "Yi" },
{ "Ж", "Zh" },
{ "ж", "zh" },
{ "Х", "Kh" },
{ "х", "kh" },
{ "Ц", "Ts" },
{ "ц", "ts" },
{ "Ч", "Ch" },
{ "ч", "ch" },
{ "Ш", "Sh" },
{ "ш", "sh" },
{ "Щ", "Shch" },
{ "щ", "shch" },
{ "ЪъЬь", "" },
{ "Ю", "Yu" },
{ "ю", "yu" },
{ "Я", "Ya" },
{ "я", "ya" },
};
public static char RemoveDiacritics(this char c){
foreach(KeyValuePair<string, string> entry in foreign_characters)
{
if(entry.Key.IndexOf (c) != -1)
{
return entry.Value[0];
}
}
return c;
}
public static string RemoveDiacritics(this string s)
{
//StringBuilder sb = new StringBuilder ();
string text = "";
foreach (char c in s)
{
int len = text.Length;
foreach(KeyValuePair<string, string> entry in foreign_characters)
{
if(entry.Key.IndexOf (c) != -1)
{
text += entry.Value;
break;
}
}
if (len == text.Length) {
text += c;
}
}
return text;
}
}
使用
// for strings
"crème brûlée".RemoveDiacritics (); // creme brulee
// for chars
"Ã"[0].RemoveDiacritics (); // A
公认的答案是完全正确的,但是现在,它应该更新为使用符文类而不是CharUnicodeInfo,因为c#和。net在最新版本中更新了分析字符串的方法(符文类已在。net Core 3.0中添加)。
下面的代码现在推荐用于。net 5+,因为它可以进一步用于非拉丁字符:
static string RemoveDiacritics(string text)
{
var normalizedString = text.Normalize(NormalizationForm.FormD);
var stringBuilder = new StringBuilder();
foreach (var c in normalizedString.EnumerateRunes())
{
var unicodeCategory = Rune.GetUnicodeCategory(c);
if (unicodeCategory != UnicodeCategory.NonSpacingMark)
{
stringBuilder.Append(c);
}
}
return stringBuilder.ToString().Normalize(NormalizationForm.FormC);
}
这就是我如何在所有的。net程序中替换变音符字符为非变音符字符
C#:
//Transforms the culture of a letter to its equivalent representation in the 0-127 ascii table, such as the letter 'é' is substituted by an 'e'
public string RemoveDiacritics(string s)
{
string normalizedString = null;
StringBuilder stringBuilder = new StringBuilder();
normalizedString = s.Normalize(NormalizationForm.FormD);
int i = 0;
char c = '\0';
for (i = 0; i <= normalizedString.Length - 1; i++)
{
c = normalizedString[i];
if (CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
{
stringBuilder.Append(c);
}
}
return stringBuilder.ToString().ToLower();
}
VB .NET:
'Transforms the culture of a letter to its equivalent representation in the 0-127 ascii table, such as the letter "é" is substituted by an "e"'
Public Function RemoveDiacritics(ByVal s As String) As String
Dim normalizedString As String
Dim stringBuilder As New StringBuilder
normalizedString = s.Normalize(NormalizationForm.FormD)
Dim i As Integer
Dim c As Char
For i = 0 To normalizedString.Length - 1
c = normalizedString(i)
If CharUnicodeInfo.GetUnicodeCategory(c) <> UnicodeCategory.NonSpacingMark Then
stringBuilder.Append(c)
End If
Next
Return stringBuilder.ToString().ToLower()
End Function