commit a25cbd6ddea80793aebd3d33f7a8bf8534b042b8 Author: Eveldee Date: Tue May 13 21:07:53 2025 +0200 Add dictionary generation diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e37216c --- /dev/null +++ b/.gitignore @@ -0,0 +1,403 @@ +# Created by https://www.toptal.com/developers/gitignore/api/csharp +# Edit at https://www.toptal.com/developers/gitignore?templates=csharp + +### Csharp ### +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Mono auto generated files +mono_crash.* + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Ww][Ii][Nn]32/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUnit +*.VisualState.xml +TestResult.xml +nunit-*.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# ASP.NET Scaffolding +ScaffoldingReadMe.txt + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.ilk +*.meta +*.obj +*.iobj +*.pch +*.pdb +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.log +*.tlog +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf 
+*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Coverlet is a free, cross platform Code Coverage Tool +coverage*.json +coverage*.xml +coverage*.info + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. 
+!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
+*.vbw + +# Visual Studio 6 auto-generated project file (contains which files were open etc.) +*.vbp + +# Visual Studio 6 workspace and project file (working project files containing files to include in project) +*.dsw +*.dsp + +# Visual Studio 6 technical files + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# Visual Studio History (VSHistory) files +.vshistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ + +# Fody - auto-generated XML schema +FodyWeavers.xsd + +# VS Code files for those working on multiple tools +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +*.code-workspace + +# Local History for Visual Studio Code +.history/ + +# Windows Installer files from build outputs +*.cab +*.msi +*.msix +*.msm +*.msp + +# JetBrains Rider +*.sln.iml + +# End of 
https://www.toptal.com/developers/gitignore/api/csharp
+.idea
diff --git a/PoyoLang.Analysis.NGrams/60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx b/PoyoLang.Analysis.NGrams/60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx
new file mode 100644
index 0000000..3c3a0de
Binary files /dev/null and b/PoyoLang.Analysis.NGrams/60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx differ
diff --git a/PoyoLang.Analysis.NGrams/PoyoLang.Analysis.NGrams.csproj b/PoyoLang.Analysis.NGrams/PoyoLang.Analysis.NGrams.csproj
new file mode 100644
index 0000000..d618101
--- /dev/null
+++ b/PoyoLang.Analysis.NGrams/PoyoLang.Analysis.NGrams.csproj
@@ -0,0 +1,19 @@
+
+
+
+ Exe
+ net9.0
+ enable
+ enable
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/PoyoLang.Analysis.NGrams/Program.cs b/PoyoLang.Analysis.NGrams/Program.cs
new file mode 100644
index 0000000..577098b
--- /dev/null
+++ b/PoyoLang.Analysis.NGrams/Program.cs
@@ -0,0 +1,112 @@
+// Computes frequency-weighted character n-grams (1 to 5 characters long) from the embedded
+// word-frequency workbook and writes them, most frequent first, to n-grams.json.
+
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Text.Json;
+using ClosedXML.Excel;
+
+Console.WriteLine("Reading Excel file...");
+
+// Load Excel data file
+const string ExcelFilePath = "PoyoLang.Analysis.NGrams.60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx";
+
+// GetManifestResourceStream returns null when the resource cannot be found; fail with a clear message
+using var excelFileStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(ExcelFilePath)
+    ?? throw new InvalidOperationException($"Embedded resource '{ExcelFilePath}' was not found.");
+
+using var workBook = new XLWorkbook(excelFileStream);
+var worksheet = workBook.Worksheet("List");
+
+Console.WriteLine("Reading word frequencies");
+
+// Read word frequencies
+var wordColumn = "C";
+var frequencyColumn = "D";
+
+var wordFrequencies = new List<(string word, long frequency)>();
+
+// Start at row 2 and stop at the first blank cell of the word column
+var row = 2;
+
+while (true)
+{
+    var wordValue = worksheet.Cell(row, wordColumn).Value;
+
+    if (wordValue.IsBlank)
+    {
+        break;
+    }
+
+    // A trailing space is appended, so n-grams can contain it (presumably as a word-boundary marker)
+    var word = $"{wordValue.GetText().Trim().ToLowerInvariant()} ";
+    var frequency = (long)worksheet.Cell(row, frequencyColumn).GetDouble();
+
+    // Skip entries that contain a parenthesis
+    if (!word.Contains('('))
+    {
+        wordFrequencies.Add((word, frequency));
+    }
+
+    row++;
+}
+
+Console.WriteLine("Computing ngrams");
+
+// Compute n-grams
+var ngrams = new Dictionary<string, long>();
+
+const int MaxLength = 5;
+const int MinLength = 1;
+
+foreach (var (word, frequency) in wordFrequencies)
+{
+    var span = word.AsSpan();
+
+    // Slide the start position through the word; at each position count every prefix up to MaxLength
+    while (span.Length >= MinLength)
+    {
+        for (int length = MinLength; length <= MaxLength; length++)
+        {
+            if (length > span.Length)
+            {
+                break;
+            }
+
+            Increment(span[..length]);
+        }
+
+        span = span[1..];
+    }
+
+    // End of the loop body; what follows is only the declaration of the local function
+    continue;
+
+    // Adds the current word's frequency to the running total of the given n-gram
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    void Increment(ReadOnlySpan<char> span)
+    {
+        var ngram = span.ToString();
+
+        if (ngrams.TryGetValue(ngram, out var count))
+        {
+            ngrams[ngram] = count + frequency;
+        }
+        else
+        {
+            ngrams[ngram] = frequency;
+        }
+    }
+}
+
+// Order frequencies
+var orderedNgrams = ngrams
+    .OrderByDescending(n => n.Value)
+    .ToList();
+
+Console.WriteLine($"Found {orderedNgrams.Count} n-grams");
+
+var serializedNgrams = JsonSerializer.Serialize(orderedNgrams, new JsonSerializerOptions() { WriteIndented = true});
+
+await File.WriteAllTextAsync("n-grams.json", serializedNgrams);
+
diff --git a/PoyoLang.Dictionary.Generation/60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx b/PoyoLang.Dictionary.Generation/60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx
new file mode 100644
index 0000000..3c3a0de
Binary files /dev/null and b/PoyoLang.Dictionary.Generation/60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx differ
diff --git a/PoyoLang.Dictionary.Generation/PoyoLang.Dictionary.Generation.csproj b/PoyoLang.Dictionary.Generation/PoyoLang.Dictionary.Generation.csproj
new file mode 100644
index
0000000..2ee3bc6
--- /dev/null
+++ b/PoyoLang.Dictionary.Generation/PoyoLang.Dictionary.Generation.csproj
@@ -0,0 +1,22 @@
+
+
+
+ Exe
+ net9.0
+ enable
+ enable
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/PoyoLang.Dictionary.Generation/Program.cs b/PoyoLang.Dictionary.Generation/Program.cs
new file mode 100644
index 0000000..d016052
--- /dev/null
+++ b/PoyoLang.Dictionary.Generation/Program.cs
@@ -0,0 +1,140 @@
+// Computes frequency-weighted character n-grams (1 to 8 characters long) from the embedded
+// word-frequency workbook, writes them to n-grams.json, then maps every entry of
+// Alphabet.BaseAlphabet to one of the most frequent n-grams and writes that map to dictionary.json.
+
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Text.Json;
+using ClosedXML.Excel;
+using PoyoLang.Dictionary;
+
+const string ExcelFilePath = "PoyoLang.Dictionary.Generation.60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx";
+
+JsonSerializerOptions jsonOptions = new JsonSerializerOptions() { WriteIndented = true };
+
+Console.WriteLine("Reading Excel file...");
+
+// Load Excel data file
+// GetManifestResourceStream returns null when the resource cannot be found; fail with a clear message
+await using var excelFileStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(ExcelFilePath)
+    ?? throw new InvalidOperationException($"Embedded resource '{ExcelFilePath}' was not found.");
+
+using var workBook = new XLWorkbook(excelFileStream);
+var worksheet = workBook.Worksheet("List");
+
+Console.WriteLine("Reading word frequencies");
+
+// Read word frequencies
+var wordColumn = "C";
+var frequencyColumn = "D";
+
+var wordFrequencies = new List<(string word, long frequency)>();
+
+// Start at row 2 and stop at the first blank cell of the word column
+var row = 2;
+
+while (true)
+{
+    var wordValue = worksheet.Cell(row, wordColumn).Value;
+
+    if (wordValue.IsBlank)
+    {
+        break;
+    }
+
+    // A trailing space is appended, so n-grams can contain it (presumably as a word-boundary marker)
+    var word = $"{wordValue.GetText().Trim().ToLowerInvariant()} ";
+    var frequency = (long)worksheet.Cell(row, frequencyColumn).GetDouble();
+
+    // Skip entries that contain a parenthesis
+    if (!word.Contains('('))
+    {
+        wordFrequencies.Add((word, frequency));
+    }
+
+    row++;
+}
+
+Console.WriteLine("Computing ngrams");
+
+// Compute n-grams
+var ngrams = new Dictionary<string, long>();
+
+const int MaxLength = 8;
+const int MinLength = 1;
+
+foreach (var (word, frequency) in wordFrequencies)
+{
+    var span = word.AsSpan();
+
+    // Slide the start position through the word; at each position count every prefix up to MaxLength
+    while (span.Length >= MinLength)
+    {
+        for (int length = MinLength; length <= MaxLength; length++)
+        {
+            if (length > span.Length)
+            {
+                break;
+            }
+
+            Increment(span[..length]);
+        }
+
+        span = span[1..];
+    }
+
+    // End of the loop body; what follows is only the declaration of the local function
+    continue;
+
+    // Adds the current word's frequency to the running total of the given n-gram
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    void Increment(ReadOnlySpan<char> span)
+    {
+        var ngram = span.ToString();
+
+        if (ngrams.TryGetValue(ngram, out var count))
+        {
+            ngrams[ngram] = count + frequency;
+        }
+        else
+        {
+            ngrams[ngram] = frequency;
+        }
+    }
+}
+
+// Order frequencies
+var orderedNgrams = ngrams
+    .OrderByDescending(n => n.Value)
+    .ToList();
+
+Console.WriteLine($"Found {orderedNgrams.Count} n-grams");
+
+var serializedNgrams = JsonSerializer.Serialize(orderedNgrams, jsonOptions);
+
+await File.WriteAllTextAsync("n-grams.json", serializedNgrams);
+
+Console.WriteLine("Generating dictionary...");
+
+// Generate dictionary
+var dictionary = new Dictionary<string, string>();
+
+// Every alphabet entry needs its own n-gram; fail early instead of indexing past the end of the list
+if (orderedNgrams.Count < Alphabet.BaseAlphabet.Length)
+{
+    throw new InvalidOperationException($"Need at least {Alphabet.BaseAlphabet.Length} n-grams but only {orderedNgrams.Count} were found.");
+}
+
+// Assign in order: the first alphabet entry gets the most frequent n-gram, the second the next one, ...
+var ngramIndex = 0;
+
+foreach (var letter in Alphabet.BaseAlphabet)
+{
+    dictionary[letter] = orderedNgrams[ngramIndex].Key;
+
+    ngramIndex++;
+}
+
+await File.WriteAllTextAsync("dictionary.json", JsonSerializer.Serialize(dictionary, jsonOptions));
+
+Console.WriteLine($"Dictionary written to {Path.Combine(Environment.CurrentDirectory, "dictionary.json")}");
\ No newline at end of file
diff --git a/PoyoLang.Dictionary/Alphabet.cs b/PoyoLang.Dictionary/Alphabet.cs
new file mode 100644
index 0000000..b626cb4
--- /dev/null
+++ b/PoyoLang.Dictionary/Alphabet.cs
@@ -0,0 +1,49 @@
+namespace PoyoLang.Dictionary;
+
+/// <summary>
+/// Character tables and the list of "p?y?" strings derived from them.
+/// </summary>
+public static class Alphabet
+{
+    /// <summary>The plain letter 'o' followed by seven accented variants (8 characters).</summary>
+    public static char[] OVariations { get; } =
+    [
+        'o',
+        'ó',
+        'ò',
+        'ô',
+        'ö',
+        'õ',
+        'ō',
+        'ǒ'
+    ];
+
+    /// <summary>The plain letter and seven accented variants of each of o, a, i, u and e (40 characters).</summary>
+    public static char[] VowelVariations { get; } =
+    [
+
+        // o
+        'o', 'ó', 'ò', 'ô', 'ö', 'õ', 'ō', 'ǒ',
+
+        // a
+        'a', 'á', 'à', 'â', 'ä', 'ã', 'ā', 'ǎ',
+
+        // i
+        'i', 'í', 'ì', 'î', 'ï', 'ĩ', 'ī', 'ǐ',
+
+        // u
+        'u', 'ú', 'ù', 'û', 'ü', 'ũ', 'ū', 'ǔ',
+
+        // e
+        'e', 'é', 'è', 'ê', 'ë', 'ẽ', 'ē', 'ě'
+    ];
+
+    // Must stay below the two tables: static initializers run in declaration order and this one reads both.
+    /// <summary>
+    /// Every string "p{left}y{right}" with <c>left</c> taken from <see cref="VowelVariations"/> and
+    /// <c>right</c> from <see cref="OVariations"/> (40 x 8 = 320 entries), ordered by left vowel, then right vowel.
+    /// </summary>
+    public static string[] BaseAlphabet { get; } = VowelVariations
+        .SelectMany(leftVowel => OVariations.Select(rightVowel => $"p{leftVowel}y{rightVowel}"))
+        .ToArray();
+}
\ No newline at end of file
diff --git a/PoyoLang.Dictionary/PoyoLang.Dictionary.csproj b/PoyoLang.Dictionary/PoyoLang.Dictionary.csproj
new file mode 100644
index 0000000..17b910f
--- /dev/null
+++ b/PoyoLang.Dictionary/PoyoLang.Dictionary.csproj
@@ -0,0 +1,9 @@
+
+
+
+ net9.0
+ enable
+ enable
+
+
+
diff --git a/PoyoLang.Test/PoyoLang.Test.csproj b/PoyoLang.Test/PoyoLang.Test.csproj
new file mode 100644
index 0000000..491aa7e
--- /dev/null
+++ b/PoyoLang.Test/PoyoLang.Test.csproj
@@ -0,0 +1,14 @@
+
+
+
+ Exe
+ net9.0
+ enable
+ enable
+
+
+
+
+
+
+
diff --git a/PoyoLang.Test/Program.cs b/PoyoLang.Test/Program.cs
new file mode 100644
index 0000000..6334d20
--- /dev/null
+++ b/PoyoLang.Test/Program.cs
@@ -0,0 +1,9 @@
+// See https://aka.ms/new-console-template for more information
+
+using PoyoLang.Dictionary;
+
+Console.OutputEncoding = System.Text.Encoding.UTF8;
+
+Console.WriteLine(string.Join(Environment.NewLine, Alphabet.BaseAlphabet));
+
+Console.WriteLine(Alphabet.BaseAlphabet.Length);
\ No newline at end of file
diff --git a/PoyoLang.sln b/PoyoLang.sln
new file mode 100644
index 0000000..0e633e8
--- /dev/null
+++ b/PoyoLang.sln
@@ -0,0 +1,28 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PoyoLang.Dictionary", "PoyoLang.Dictionary\PoyoLang.Dictionary.csproj", "{2D875AAD-BE17-4D15-A876-19DF1DCC57F5}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PoyoLang.Test", "PoyoLang.Test\PoyoLang.Test.csproj", "{4CB193B2-44F2-4926-A56E-9A0CDCBC828C}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PoyoLang.Dictionary.Generation", "PoyoLang.Dictionary.Generation\PoyoLang.Dictionary.Generation.csproj", "{43FFCEF2-A4AA-49A1-9731-CB6DAD9863F2}"
+EndProject
+Global
+
GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {2D875AAD-BE17-4D15-A876-19DF1DCC57F5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {2D875AAD-BE17-4D15-A876-19DF1DCC57F5}.Debug|Any CPU.Build.0 = Debug|Any CPU + {2D875AAD-BE17-4D15-A876-19DF1DCC57F5}.Release|Any CPU.ActiveCfg = Release|Any CPU + {2D875AAD-BE17-4D15-A876-19DF1DCC57F5}.Release|Any CPU.Build.0 = Release|Any CPU + {4CB193B2-44F2-4926-A56E-9A0CDCBC828C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {4CB193B2-44F2-4926-A56E-9A0CDCBC828C}.Debug|Any CPU.Build.0 = Debug|Any CPU + {4CB193B2-44F2-4926-A56E-9A0CDCBC828C}.Release|Any CPU.ActiveCfg = Release|Any CPU + {4CB193B2-44F2-4926-A56E-9A0CDCBC828C}.Release|Any CPU.Build.0 = Release|Any CPU + {43FFCEF2-A4AA-49A1-9731-CB6DAD9863F2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {43FFCEF2-A4AA-49A1-9731-CB6DAD9863F2}.Debug|Any CPU.Build.0 = Debug|Any CPU + {43FFCEF2-A4AA-49A1-9731-CB6DAD9863F2}.Release|Any CPU.ActiveCfg = Release|Any CPU + {43FFCEF2-A4AA-49A1-9731-CB6DAD9863F2}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection +EndGlobal