Compare commits

...

3 Commits

Author SHA1 Message Date
f39d936024 Add translator from poyo 2025-05-16 14:07:02 +02:00
f83f59c05a Add translator to poyo 2025-05-16 11:27:16 +02:00
871f46b996 Add translator source generator 2025-05-15 21:03:35 +02:00
14 changed files with 847 additions and 130 deletions

View File

@@ -1,19 +0,0 @@
<!-- Console executable (net9.0) for the n-gram analysis tool. -->
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net9.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<!-- The word-frequency workbook is removed from the default None items and embedded in the assembly instead, -->
<!-- so the program can load it through GetManifestResourceStream. -->
<ItemGroup>
<None Remove="60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx" />
<EmbeddedResource Include="60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx" />
</ItemGroup>
<!-- ClosedXML is used to read the workbook. -->
<ItemGroup>
<PackageReference Include="ClosedXML" Version="0.104.2" />
</ItemGroup>
</Project>

View File

@@ -1,105 +0,0 @@
// See https://aka.ms/new-console-template for more information
// Load up the file
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Text.Json;
using ClosedXML.Excel;
Console.WriteLine("Reading Excel file...");

// Load the Excel data file that is embedded in this assembly.
// GetManifestResourceStream returns null when the resource name does not match, so fail with a
// clear message instead of handing a null stream to XLWorkbook.
const string ExcelFilePath = "PoyoLang.Analysis.NGrams.60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx";
using var excelFileStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(ExcelFilePath)
    ?? throw new InvalidOperationException($"Embedded resource '{ExcelFilePath}' was not found.");
using var workBook = new XLWorkbook(excelFileStream);
var worksheet = workBook.Worksheet("List");

Console.WriteLine("Reading word frequencies");

// Read word frequencies: column C holds the word, column D its frequency; row 1 is skipped.
var wordColumn = "C";
var frequencyColumn = "D";
var wordFrequencies = new List<(string word, long frequency)>();
var row = 2;
while (true)
{
    var wordValue = worksheet.Cell(row, wordColumn).Value;

    // The first blank word cell marks the end of the list
    if (wordValue.IsBlank)
    {
        break;
    }

    // A trailing space is appended so that end-of-word n-grams are counted separately
    var word = $"{wordValue.GetText().Trim().ToLowerInvariant()} ";
    var frequency = (long)worksheet.Cell(row, frequencyColumn).GetDouble();

    // Entries containing a parenthesis are ignored
    if (!word.Contains('('))
    {
        wordFrequencies.Add((word, frequency));
    }

    row++;
}

Console.WriteLine("Computing ngrams");

// Compute n-grams: for every start position of every word, count each substring of
// MinLength..MaxLength characters, weighted by the word's frequency.
var ngrams = new Dictionary<string, long>();
const int MaxLength = 5;
const int MinLength = 1;
foreach (var (word, frequency) in wordFrequencies)
{
    var span = word.AsSpan();
    while (span.Length >= MinLength)
    {
        // Never read past the end of the remaining text
        var longest = Math.Min(MaxLength, span.Length);
        for (var length = MinLength; length <= longest; length++)
        {
            var ngram = span[..length].ToString();
            ngrams[ngram] = ngrams.GetValueOrDefault(ngram) + frequency;
        }

        span = span[1..];
    }
}

// Order frequencies, most frequent first
var orderedNgrams = ngrams
    .OrderByDescending(n => n.Value)
    .ToList();

Console.WriteLine($"Found {orderedNgrams.Count} n-grams");

var serializedNgrams = JsonSerializer.Serialize(orderedNgrams, new JsonSerializerOptions() { WriteIndented = true});
await File.WriteAllTextAsync("n-grams.json", serializedNgrams);

View File

@@ -121,4 +121,14 @@ foreach (var letter in Alphabet.BaseAlphabet)
await File.WriteAllTextAsync("dictionary.json", JsonSerializer.Serialize(dictionary, jsonOptions));
Console.WriteLine($"Dictionary written to {Path.Combine(Environment.CurrentDirectory, "dictionary.json")}");
Console.WriteLine($"Dictionary written to {Path.Combine(Environment.CurrentDirectory, "dictionary.json")}");
// Also write in simple custom format for source generator
await using var customOutput = File.CreateText("dictionary.txt");
foreach (var pair in dictionary)
{
await customOutput.WriteLineAsync($"{pair.Key}={pair.Value}");
}
Console.WriteLine($"Custom dictionary written to {Path.Combine(Environment.CurrentDirectory, "dictionary.txt")}");

View File

@@ -8,7 +8,7 @@
</PropertyGroup>
<ItemGroup>
<ProjectReference Include="..\PoyoLang.Dictionary\PoyoLang.Dictionary.csproj" />
<ProjectReference Include="..\PoyoLang.Translator\PoyoLang.Translator.csproj" />
</ItemGroup>
</Project>

View File

@@ -1,9 +1,25 @@
// See https://aka.ms/new-console-template for more information
// Small console demo: translates a sample sentence to Poyo and back again, printing each step.
// NOTE(review): this listing comes from a diff view and appears to interleave removed and added lines
// (e.g. the two OutputEncoding assignments) — confirm against the actual file.
using PoyoLang.Dictionary;
using System.Text;
using PoyoLang.Translator;
// Emit console output as UTF-8
Console.OutputEncoding = System.Text.Encoding.UTF8;
Console.OutputEncoding = Encoding.UTF8;
Console.WriteLine(string.Join(Environment.NewLine, Alphabet.BaseAlphabet));
// Sample text used for the round trip
var text = "Immutable abstract representation of a span of text. For example, in an error diagnostic that reports a location, it could come from a parsed string, text from a tool editor buffer, etc.";
Console.WriteLine(Alphabet.BaseAlphabet.Length);
Console.WriteLine("Original:");
Console.WriteLine(text);
Console.WriteLine();
var translator = new PoyoLangTranslator();
// Forward translation
var translated = translator.TranslateToPoyo(text);
Console.WriteLine("Translated to Poyo:");
Console.WriteLine(translated);
Console.WriteLine();
// Reverse translation of the text produced above
var original = translator.TranslateFromPoyo(translated);
Console.WriteLine("Translated back from Poyo:");
Console.WriteLine(original);

View File

@@ -0,0 +1,15 @@
namespace PoyoLang.Translator.SourceGenerator;
/// <summary>
/// One node of the prefix tree built by the source generator.
/// </summary>
/// <param name="letter">Character that leads from the parent node to this node.</param>
/// <param name="target">Text associated with the prefix that ends at this node.</param>
public class Node(char letter, string target)
{
    /// <summary>Character that leads from the parent node to this node.</summary>
    public char Letter { get; } = letter;

    /// <summary>Text associated with the prefix that ends at this node.</summary>
    public string Target { get; } = target;

    /// <summary>Child nodes, one per prefix that is exactly one character longer.</summary>
    public List<Node> Nodes { get; } = [];
}

View File

@@ -0,0 +1,26 @@
<!-- Roslyn source generator project that produces the generated half of PoyoLangTranslator. -->
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<!-- Targets netstandard2.0 while compiling with the latest C# language version. -->
<TargetFramework>netstandard2.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<LangVersion>latest</LangVersion>
<EnforceExtendedAnalyzerRules>true</EnforceExtendedAnalyzerRules>
<!-- Marks the project as a Roslyn component (see the DebugRoslynComponent launch profile). -->
<IsRoslynComponent>true</IsRoslynComponent>
<EmitCompilerGeneratedFiles>true</EmitCompilerGeneratedFiles>
</PropertyGroup>
<ItemGroup>
<!-- Build-time only: analyzers for analyzer/generator authors. -->
<PackageReference Include="Microsoft.CodeAnalysis.Analyzers" Version="3.11.0">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<!-- Roslyn C# APIs used by the generator. -->
<PackageReference Include="Microsoft.CodeAnalysis.CSharp" Version="4.12.0"/>
<!-- Build-time only. NOTE(review): presumably here to polyfill newer language features (ranges, raw strings, ...) on netstandard2.0 — confirm. -->
<PackageReference Include="PolySharp" Version="1.15.0">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
</ItemGroup>
</Project>

View File

@@ -0,0 +1,377 @@
using System.Text;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.Text;
namespace PoyoLang.Translator.SourceGenerator;
/// <summary>
/// Incremental source generator that reads the <c>dictionary.txt</c> additional file
/// (one <c>poyo-letter=n-gram</c> pair per line) and emits <c>PoyoLangTranslator.g.cs</c>,
/// the generated half of the partial <c>PoyoLangTranslator</c> class.
/// </summary>
/// <remarks>
/// Two private methods are generated: <c>NextLetter</c>, a nested switch over a prefix tree of the
/// n-grams that emits the poyo letter of the longest matching n-gram, and <c>FromPoyo</c>, a switch
/// that maps one poyo letter back to its n-gram. All string handling here is ordinal/invariant so
/// the generated code does not depend on the culture of the machine running the build.
/// </remarks>
[Generator]
public class PoyoLangTranslatorGenerator : IIncrementalGenerator
{
    private const char IndentChar = '\t';

    /// <summary>Name (path suffix) of the additional file holding the dictionary.</summary>
    private const string DictionaryFileName = "dictionary.txt";

    /// <summary>Number of characters the generated <c>FromPoyo</c> reads as one poyo letter.</summary>
    private const int PoyoLetterLength = 4;

    /// <summary>
    /// Wires the pipeline: additional file -> parsed dictionary -> prefix tree -> generated source.
    /// </summary>
    /// <param name="context">Initialization context supplied by the compiler.</param>
    public void Initialize(IncrementalGeneratorInitializationContext context)
    {
        // There will be only one dictionary file, but incremental generators work as pipelines
        var dictionaries = context.AdditionalTextsProvider
            .Where(static text => text.Path.EndsWith(DictionaryFileName, StringComparison.Ordinal))
            .Select(static (text, cancellationToken) => text.GetText(cancellationToken))
            // GetText may return null; drop such files instead of failing later
            .Where(static text => text is not null)
            .Select(static (text, _) => ReadCustomDictionary(text!));

        var prefixTrees = dictionaries
            .Select(static (dictionary, _) =>
            {
                // Reverse the dictionary (n-gram -> poyo letter), ordered by n-gram.
                // Ordinal ordering keeps the generated output identical on every machine.
                var reversed = dictionary
                    .OrderBy(static p => p.Value, StringComparer.Ordinal)
                    .ToDictionary(static p => p.Value, static p => p.Key);

                return (Dictionary: dictionary, PrefixTree: BuildPrefixTree(reversed));
            });

        context.RegisterSourceOutput(prefixTrees, static (sourceProductionContext, data) =>
        {
            sourceProductionContext.AddSource("PoyoLangTranslator.g.cs", GenerateSource(data.Dictionary, data.PrefixTree));
        });
    }

    /// <summary>
    /// Parses the custom dictionary format: one <c>key=value</c> pair per line, split on the first '='.
    /// </summary>
    /// <param name="text">Content of the dictionary file.</param>
    /// <returns>Map from key (poyo letter) to value (n-gram); later duplicates of a key win.</returns>
    private static Dictionary<string, string> ReadCustomDictionary(SourceText text)
    {
        var dictionary = new Dictionary<string, string>();

        foreach (var rawLine in text.ToString().Split('\n'))
        {
            var line = rawLine.TrimEnd('\r');
            var splitIndex = line.IndexOf('=');

            // Skip blank lines (such as the one after the final newline) and lines without a key
            // or without a separator, instead of throwing on them
            if (splitIndex < 1)
            {
                continue;
            }

            // The value is taken verbatim: it may legitimately be or end with a space
            dictionary[line.Substring(0, splitIndex)] = line.Substring(splitIndex + 1);
        }

        return dictionary;
    }

    /// <summary>
    /// Builds a prefix tree from the reversed dictionary. Root nodes are the one-character keys;
    /// a key becomes a child of the key that is exactly one character shorter and is its prefix.
    /// </summary>
    /// <param name="dictionary">Map from n-gram to poyo letter.</param>
    /// <returns>Root nodes of the tree.</returns>
    private static List<Node> BuildPrefixTree(Dictionary<string, string> dictionary)
    {
        var rootNodes = new List<Node>();

        foreach (var entry in dictionary.Where(static p => p.Key.Length is 1))
        {
            var root = new Node(entry.Key[0], entry.Value);
            rootNodes.Add(root);

            // Add sub-nodes
            AddChildren(root, entry.Key);
        }

        return rootNodes;

        void AddChildren(Node parent, string prefix)
        {
            // Find the keys that extend the parent's prefix by exactly one character
            var extensions = dictionary
                .Where(p => p.Key.Length == prefix.Length + 1 && p.Key.StartsWith(prefix, StringComparison.Ordinal));

            foreach (var extension in extensions)
            {
                var child = new Node(extension.Key[prefix.Length], extension.Value);
                parent.Nodes.Add(child);

                // Recursively add sub-nodes
                AddChildren(child, extension.Key);
            }
        }
    }

    /// <summary>
    /// Produces the complete text of the generated partial class.
    /// </summary>
    /// <param name="dictionary">Map from poyo letter to n-gram.</param>
    /// <param name="rootNodes">Prefix tree built from the reversed dictionary.</param>
    private static string GenerateSource(Dictionary<string, string> dictionary, List<Node> rootNodes)
    {
        var source = new StringBuilder();

        // Usings and namespace
        AppendLine(source, 0, "using System;");
        AppendLine(source, 0, "using System.Text;");
        AppendLine(source, 0, "");
        AppendLine(source, 0, "namespace PoyoLang.Translator;");
        AppendLine(source, 0, "");

        // Partial class definition
        AppendLine(source, 0, "public partial class PoyoLangTranslator");
        AppendLine(source, 0, "{");

        GenerateNextLetterMethod(rootNodes, source);
        AppendLine(source, 0, "");
        GenerateFromPoyoMethod(dictionary, source);

        // Partial class end
        AppendLine(source, 0, "}");

        return source.ToString();
    }

    /// <summary>
    /// Emits <c>NextLetter</c>: consumes the longest known n-gram at the start of <c>text</c> and
    /// appends its poyo letter (title-cased when the first character is upper case); an unknown
    /// character is copied through unchanged.
    /// </summary>
    private static void GenerateNextLetterMethod(List<Node> rootNodes, StringBuilder source)
    {
        // Next letter method definition
        Line(1, "private void NextLetter(ref ReadOnlySpan<char> text, StringBuilder output)");
        Line(1, "{");

        // 0 length case and caps
        Line(2, "if (text.Length < 1)");
        Line(2, "{");
        Line(3, "return;");
        Line(2, "}");
        Line();
        Line(2, "var isCaps = char.IsUpper(text[0]);");
        Line();

        GenerateSwitchCases(rootNodes, depth: 0);

        // Next letter method end
        Line();
        Line(2, "// Punctuation/Unknown characters case");
        Line(2, "output.Append(text[0]);");
        Line(2, "text = text[1..];");
        Line(1, "}");

        return;

        void Line(int level = 0, string text = "") => AppendLine(source, level, text);

        void GenerateSwitchCases(List<Node> nodes, int depth)
        {
            // The method body is two levels deep; each nesting adds three (case, if, switch)
            var level = 2 + depth * 3;

            // Switch-case start
            Line(level, $"switch (text[{depth}])");
            Line(level, "{");

            foreach (var node in nodes)
            {
                var lower = node.Letter;
                var upper = char.ToUpperInvariant(lower);

                // Letters without a distinct upper-case form (space, punctuation) get a single pattern
                var pattern = upper == lower
                    ? CharLiteral(lower)
                    : $"{CharLiteral(lower)} or {CharLiteral(upper)}";

                // Case start
                Line(level + 1, $"case {pattern}:");

                // Sub nodes handling: try longer n-grams first
                if (node.Nodes.Count > 0)
                {
                    Line(level + 2, $"if (text.Length > {depth + 1})");
                    Line(level + 2, "{");
                    GenerateSwitchCases(node.Nodes, depth + 1);
                    Line(level + 2, "}");
                    Line();
                }

                // Current node handling fallback: reached when no longer n-gram matched
                Line(level + 2, $"text = text[{depth + 1}..];");
                Line(level + 2, $"output.Append(isCaps ? {StringLiteral(ToTitleCase(node.Target))} : {StringLiteral(node.Target)});");
                Line(level + 2, "return;");
            }

            // Switch-case end
            Line(level, "}");
        }
    }

    /// <summary>
    /// Emits <c>FromPoyo</c>: reads one poyo letter from the start of <c>text</c> and appends the
    /// n-gram it stands for; anything that is not a poyo letter is copied through one character at a time.
    /// </summary>
    private static void GenerateFromPoyoMethod(Dictionary<string, string> dictionary, StringBuilder source)
    {
        // From Poyo method definition
        Line(1, "private void FromPoyo(ref ReadOnlySpan<char> text, StringBuilder output)");
        Line(1, "{");

        // Initial cases
        Line(2, "if (text.Length < 1)");
        Line(2, "{");
        Line(3, "return;");
        Line(2, "}");
        Line();
        Line(2, "// This happens if the end of the text is not a poyo letter (punctuation for ex)");
        Line(2, $"if (text.Length < {PoyoLetterLength})");
        Line(2, "{");
        Line(3, "output.Append(text);");
        Line(3, "text = text[^0..];");
        Line(3, "return;");
        Line(2, "}");
        Line();
        Line(2, $"var letter = text[..{PoyoLetterLength}];");
        Line();

        // Switch start
        Line(2, "switch (letter)");
        Line(2, "{");

        foreach (var pair in dictionary)
        {
            // Non-caps case
            EmitCase(pair.Key, pair.Value);

            // Caps case; skipped when the key has no distinct title-case form, which would
            // otherwise produce a duplicate case label
            var titleKey = ToTitleCase(pair.Key);
            if (!string.Equals(titleKey, pair.Key, StringComparison.Ordinal))
            {
                EmitCase(titleKey, ToTitleCase(pair.Value));
            }
        }

        // Switch end
        Line(3, "default:");
        Line(4, "// Not a poyo letter, only read 1 character (could be punctuation for ex)");
        Line(4, "output.Append(text[0]);");
        Line(4, "text = text[1..];");
        Line(4, "return;");
        Line(2, "}");

        // From Poyo method end
        Line();
        Line(2, "// Advance in text");
        Line(2, $"text = text[{PoyoLetterLength}..];");
        Line(1, "}");

        return;

        void Line(int level = 0, string text = "") => AppendLine(source, level, text);

        void EmitCase(string poyoLetter, string ngram)
        {
            Line(3, $"case {StringLiteral(poyoLetter)}:");
            Line(4, $"output.Append({StringLiteral(ngram)});");
            Line(4, "break;");
        }
    }

    /// <summary>
    /// Upper-cases the first character of <paramref name="text"/> (culture-invariant);
    /// an empty string is returned unchanged.
    /// </summary>
    private static string ToTitleCase(string text)
    {
        return text.Length == 0
            ? text
            : char.ToUpperInvariant(text[0]) + text.Substring(1);
    }

    /// <summary>Formats a character as an escaped, quoted C# character literal.</summary>
    private static string CharLiteral(char value) =>
        Microsoft.CodeAnalysis.CSharp.SymbolDisplay.FormatLiteral(value, quote: true);

    /// <summary>Formats a string as an escaped, quoted C# string literal.</summary>
    private static string StringLiteral(string value) =>
        Microsoft.CodeAnalysis.CSharp.SymbolDisplay.FormatLiteral(value, quote: true);

    /// <summary>
    /// Appends one line of generated code, indented by <paramref name="depth"/> indent characters
    /// and always terminated by a newline. Empty lines are written without indentation.
    /// </summary>
    private static void AppendLine(StringBuilder source, int depth, string text)
    {
        if (text.Length > 0)
        {
            source.Append(IndentChar, depth).Append(text);
        }

        source.Append('\n');
    }
}

View File

@@ -0,0 +1,9 @@
{
"$schema": "http://json.schemastore.org/launchsettings.json",
"profiles": {
"PoyoLang.Translator.SourceGenerator": {
"commandName": "DebugRoslynComponent",
"targetProject": "../PoyoLang.Test/PoyoLang.Test.csproj"
}
}
}

View File

@@ -0,0 +1,17 @@
<!-- Translator library (net9.0). -->
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net9.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<!-- The source generator project is referenced as an analyzer only; its assembly is not referenced by this library. -->
<ItemGroup>
<ProjectReference Include="..\PoyoLang.Translator.SourceGenerator\PoyoLang.Translator.SourceGenerator.csproj" OutputItemType="Analyzer" ReferenceOutputAssembly="false" />
</ItemGroup>
<!-- Registers dictionary.txt as an additional file, which is what the generator filters its additional texts for. -->
<ItemGroup>
<AdditionalFiles Include="dictionary.txt" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,39 @@
using System.Text;
namespace PoyoLang.Translator;
/// <summary>
/// Public entry points of the translator. The per-letter work is done by the private
/// <c>NextLetter</c> and <c>FromPoyo</c> methods declared in the other part of this partial class.
/// </summary>
public partial class PoyoLangTranslator
{
    /// <summary>
    /// Translates <paramref name="text"/> to Poyo, writing a single space between consecutive
    /// translated pieces.
    /// </summary>
    /// <param name="text">Text to translate.</param>
    /// <returns>The translated text; an empty string for empty input.</returns>
    public string TranslateToPoyo(ReadOnlySpan<char> text)
    {
        var builder = new StringBuilder(text.Length);

        while (!text.IsEmpty)
        {
            // Consumes the start of the remaining text and appends its translation
            NextLetter(ref text, builder);

            // Nothing left: do not write a trailing separator
            if (text.IsEmpty)
            {
                break;
            }

            builder.Append(' ');
        }

        return builder.ToString();
    }

    /// <summary>
    /// Translates Poyo text back, ignoring the spaces found between pieces.
    /// </summary>
    /// <param name="text">Poyo text to translate back.</param>
    /// <returns>The reconstructed text; an empty string for empty input.</returns>
    public string TranslateFromPoyo(ReadOnlySpan<char> text)
    {
        var builder = new StringBuilder(text.Length);

        while (!text.IsEmpty)
        {
            // Spaces are only separators in this language, so drop any leading ones first
            text = text.TrimStart(' ');
            FromPoyo(ref text, builder);
        }

        return builder.ToString();
    }
}

View File

@@ -0,0 +1,320 @@
poyo=
poyó=e
poyò=t
poyô=a
poyö=o
poyõ=i
poyō=n
poyǒ=e
póyo=r
póyó=h
póyò=s
póyô=l
póyö=d
póyõ=c
póyō=th
póyǒ=u
pòyo=t
pòyó=he
pòyò=m
pòyô=b
pòyö=f
pòyõ=y
pòyō=p
pòyǒ=the
pôyo=n
pôyó=w
pôyò=g
pôyô=he
pôyö=in
pôyõ=y
pôyō=d
pôyǒ=r
pöyo=an
pöyó=er
pöyò=the
pöyô=be
pöyö=at
pöyõ=re
pöyō=v
pöyǒ=on
põyo=o
põyó=nd
põyò=or
põyô=be
põyö=ha
põyõ=en
põyō=to
põyǒ=ve
pōyo=ou
pōyó=nd
pōyò=it
pōyô=st
pōyö=l
pōyõ=k
pōyō=te
pōyǒ=al
pǒyo=ti
pǒyó=f
pǒyò=and
pǒyô=s
pǒyö=er
pǒyõ=nt
pǒyō=and
pǒyǒ=of
payo=ar
payó=a
payò=se
payô=to
payö=ea
payõ=hi
payō=of
payǒ=me
páyo=le
páyó=on
páyò=h
páyô=co
páyö=is
páyõ=in
páyō=at
páyǒ=ro
pàyo=ll
pàyó=ve
pàyò=de
pàyô=es
pàyö=ng
pàyõ=io
pàyō=om
pàyǒ=ne
pâyo=ic
pâyó=li
pâyò=ri
pâyô=ra
pâyö=as
pâyõ=ce
pâyō=g
pâyǒ=ho
päyo=ion
päyó=ca
päyò=or
päyô=ta
päyö=ut
päyõ=el
päyō=ch
päyǒ=m
pãyo=hat
pãyó=ma
pãyò=hat
pãyô=ur
pãyö=k
pãyõ=ng
pãyō=fo
pãyǒ=re
pāyo=no
pāyó=si
pāyò=her
pāyô=av
pāyö=nt
pāyõ=tha
pāyō=ion
pāyǒ=il
pǎyo=ent
pǎyó=et
pǎyò=la
pǎyô=us
pǎyö=ac
pǎyõ=ly
pǎyō=ing
pǎyǒ=wh
piyo=ow
piyó=ave
piyò=pe
piyô=ec
piyö=ly
piyõ=ot
piyō=tio
piyǒ=ll
píyo=tion
píyó=wi
píyò=ave
píyô=se
píyö=al
píyõ=ing
píyō=ge
píyǒ=it
pìyo=so
pìyó=that
pìyò=that
pìyô=for
pìyö=ay
pìyõ=st
pìyō=lo
pìyǒ=pr
pîyo=ee
pîyó=hav
pîyò=have
pîyô=have
pîyö=tr
pîyõ=sh
pîyō=le
pîyǒ=w
pïyo=mo
pïyó=an
pïyò=tion
pïyô=ut
pïyö=un
pïyõ=ce
pïyō=ct
pïyǒ=ay
pĩyo=me
pĩyó=di
pĩyò=ss
pĩyô=ed
pĩyö=i
pĩyõ=we
pĩyō=ol
pĩyǒ=yo
pīyo=ul
pīyó=rt
pīyò=te
pīyô=em
pīyö=th
pīyõ=ter
pīyō=do
pīyǒ=ke
pǐyo=po
pǐyó=ir
pǐyò=thi
pǐyô=nc
pǐyö=you
pǐyõ=his
pǐyō=im
pǐyǒ=is
puyo=oo
puyó=all
puyò=ent
puyô=ig
puyö=pa
puyõ=ate
puyō=p
puyǒ=ati
púyo=ld
púyó=fi
púyò=his
púyô=en
púyö=ver
púyõ=na
púyō=mi
púyǒ=ry
pùyo=ai
pùyó=pl
pùyò=ow
pùyô=gh
pùyö=wo
pùyõ=sa
pùyō=ad
pùyǒ=her
pûyo=ld
pûyó=ev
pûyò=su
pûyô=os
pûyö=iv
pûyõ=for
pûyō=ther
pûyǒ=wa
püyo=ni
püyó=ry
püyò=ith
püyô=am
püyö=bo
püyõ=u
püyō=ch
püyǒ=ab
pũyo=ou
pũyó=you
pũyò=op
pũyô=id
pũyö=wit
pũyõ=ne
pũyō=bu
pũyǒ=with
pūyo=fe
pūyó=tu
pūyò=bl
pūyô=ere
pūyö=atio
pūyõ=x
pūyō=ed
pūyǒ=ation
pǔyo=ome
pǔyó=out
pǔyò=con
pǔyô=ke
pǔyö=ns
pǔyõ=rea
pǔyō=eve
pǔyǒ=ci
peyo=ie
peyó=com
peyò=ar
peyô=et
peyö=ith
peyõ=vi
peyō=ty
peyǒ=with
péyo=ear
péyó=fr
péyò=if
péyô=ag
péyö=res
péyõ=ate
péyō=do
péyǒ=mp
pèyo=ey
pèyó=ive
pèyò=ia
pèyô=pro
pèyö=ba
pèyõ=ov
pèyō=nce
pèyǒ=as
pêyo=ck
pêyó=sta
pêyò=sp
pêyô=ty
pêyö=gr
pêyõ=ter
pêyō=ation
pêyǒ=hin
pëyo=ess
pëyó=ak
pëyò=ge
pëyô=ill
pëyö=go
pëyõ=out
pëyō=our
pëyǒ=ot
pẽyo=ey
pẽyó=fa
pẽyò=ss
pẽyô=igh
pẽyö=not
pẽyõ=int
pẽyō=ex
pẽyǒ=j
pēyo=om
pēyó=one
pēyò=ap
pēyô=men
pēyö=all
pēyõ=od
pēyō=here
pēyǒ=est
pěyo=up
pěyó=ive
pěyò=rs
pěyô=ere
pěyö=ove
pěyõ=nce
pěyō=ide
pěyǒ=uc

View File

@@ -6,6 +6,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PoyoLang.Test", "PoyoLang.T
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PoyoLang.Dictionary.Generation", "PoyoLang.Dictionary.Generation\PoyoLang.Dictionary.Generation.csproj", "{43FFCEF2-A4AA-49A1-9731-CB6DAD9863F2}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PoyoLang.Translator.SourceGenerator", "PoyoLang.Translator.SourceGenerator\PoyoLang.Translator.SourceGenerator.csproj", "{0411CE3E-B80E-4AC3-839F-307AD0A16774}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PoyoLang.Translator", "PoyoLang.Translator\PoyoLang.Translator.csproj", "{079808D0-16FB-4D01-A502-5366018312CB}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -24,5 +28,13 @@ Global
{43FFCEF2-A4AA-49A1-9731-CB6DAD9863F2}.Debug|Any CPU.Build.0 = Debug|Any CPU
{43FFCEF2-A4AA-49A1-9731-CB6DAD9863F2}.Release|Any CPU.ActiveCfg = Release|Any CPU
{43FFCEF2-A4AA-49A1-9731-CB6DAD9863F2}.Release|Any CPU.Build.0 = Release|Any CPU
{0411CE3E-B80E-4AC3-839F-307AD0A16774}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{0411CE3E-B80E-4AC3-839F-307AD0A16774}.Debug|Any CPU.Build.0 = Debug|Any CPU
{0411CE3E-B80E-4AC3-839F-307AD0A16774}.Release|Any CPU.ActiveCfg = Release|Any CPU
{0411CE3E-B80E-4AC3-839F-307AD0A16774}.Release|Any CPU.Build.0 = Release|Any CPU
{079808D0-16FB-4D01-A502-5366018312CB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{079808D0-16FB-4D01-A502-5366018312CB}.Debug|Any CPU.Build.0 = Debug|Any CPU
{079808D0-16FB-4D01-A502-5366018312CB}.Release|Any CPU.ActiveCfg = Release|Any CPU
{079808D0-16FB-4D01-A502-5366018312CB}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
EndGlobal