Add dictionary generation

This commit is contained in:
2025-05-13 21:07:53 +02:00
commit a25cbd6dde
12 changed files with 777 additions and 0 deletions

403
.gitignore vendored Normal file
View File

@@ -0,0 +1,403 @@
# Created by https://www.toptal.com/developers/gitignore/api/csharp
# Edit at https://www.toptal.com/developers/gitignore?templates=csharp
### Csharp ###
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Mono auto generated files
mono_crash.*
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Ww][Ii][Nn]32/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
# ASP.NET Scaffolding
ScaffoldingReadMe.txt
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.log
*.tlog
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Coverlet is a free, cross platform Code Coverage Tool
coverage*.json
coverage*.xml
coverage*.info
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produce more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio 6 auto-generated project file (contains which files were open etc.)
*.vbp
# Visual Studio 6 workspace and project file (working project files containing files to include in project)
*.dsw
*.dsp
# Visual Studio 6 technical files
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# CodeRush personal settings
.cr/personal
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/
# Local History for Visual Studio
.localhistory/
# Visual Studio History (VSHistory) files
.vshistory/
# BeatPulse healthcheck temp database
healthchecksdb
# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/
# Ionide (cross platform F# VS Code tools) working folder
.ionide/
# Fody - auto-generated XML schema
FodyWeavers.xsd
# VS Code files for those working on multiple tools
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace
# Local History for Visual Studio Code
.history/
# Windows Installer files from build outputs
*.cab
*.msi
*.msix
*.msm
*.msp
# JetBrains Rider
*.sln.iml
# End of https://www.toptal.com/developers/gitignore/api/csharp
.idea

View File

@@ -0,0 +1,19 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Console executable targeting .NET 9 with implicit usings and nullable reference types enabled. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net9.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<!-- The word-frequency workbook is taken out of the None item group and embedded into the assembly as a manifest resource. -->
<ItemGroup>
<None Remove="60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx" />
<EmbeddedResource Include="60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx" />
</ItemGroup>
<!-- ClosedXML NuGet package, pinned to 0.104.2. -->
<ItemGroup>
<PackageReference Include="ClosedXML" Version="0.104.2" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,105 @@
// See https://aka.ms/new-console-template for more information
// Load up the file
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Text.Json;
using ClosedXML.Excel;
Console.WriteLine("Reading Excel file...");

// Load Excel data file, which is embedded in this assembly as a manifest resource.
const string ExcelFilePath = "PoyoLang.Analysis.NGrams.60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx";

// GetManifestResourceStream returns null when no resource carries this exact name;
// fail with a descriptive error rather than handing null to XLWorkbook.
using var excelFileStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(ExcelFilePath)
    ?? throw new InvalidOperationException($"Embedded resource '{ExcelFilePath}' was not found in the executing assembly.");
using var workBook = new XLWorkbook(excelFileStream);
var worksheet = workBook.Worksheet("List");

Console.WriteLine("Reading word frequencies");

// Read word frequencies: words are read from column C, their counts from column D.
var wordColumn = "C";
var frequencyColumn = "D";
var wordFrequencies = new List<(string word, long frequency)>();

// Reading starts at row 2 (row 1 is presumably a header row - confirm against the workbook)
// and stops at the first blank cell in the word column.
for (var row = 2; ; row++)
{
    var wordValue = worksheet.Cell(row, wordColumn).Value;
    if (wordValue.IsBlank)
    {
        break;
    }

    // NOTE(review): GetText() is expected to throw when a cell is not stored as text
    // (e.g. a boolean or a number) - confirm every entry in the word column is a text cell.
    // A trailing space is appended to each word, so the space takes part in the n-grams below.
    var word = $"{wordValue.GetText().Trim().ToLowerInvariant()} ";
    var frequency = (long)worksheet.Cell(row, frequencyColumn).GetDouble();

    // Entries containing an opening parenthesis are left out.
    if (!word.Contains('('))
    {
        wordFrequencies.Add((word, frequency));
    }
}

Console.WriteLine("Computing ngrams");

// Compute n-grams: every substring of MinLength..MaxLength characters, at every start
// position of every word, accumulates that word's frequency.
var ngrams = new Dictionary<string, long>();
const int MaxLength = 5;
const int MinLength = 1;

foreach (var (word, frequency) in wordFrequencies)
{
    for (var start = 0; word.Length - start >= MinLength; start++)
    {
        // Never read past the end of the word.
        var longest = Math.Min(MaxLength, word.Length - start);
        for (var length = MinLength; length <= longest; length++)
        {
            var ngram = word.Substring(start, length);
            // GetValueOrDefault yields 0 for an n-gram that has not been seen yet.
            ngrams[ngram] = ngrams.GetValueOrDefault(ngram) + frequency;
        }
    }
}

// Order frequencies, most frequent n-gram first.
var orderedNgrams = ngrams
    .OrderByDescending(n => n.Value)
    .ToList();

Console.WriteLine($"Found {orderedNgrams.Count} n-grams");

// Write the ordered list as indented JSON into the current working directory.
var serializedNgrams = JsonSerializer.Serialize(orderedNgrams, new JsonSerializerOptions() { WriteIndented = true});
await File.WriteAllTextAsync("n-grams.json", serializedNgrams);

View File

@@ -0,0 +1,22 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Console executable targeting .NET 9 with implicit usings and nullable reference types enabled. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net9.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<!-- The word-frequency workbook is embedded into the assembly as a manifest resource. -->
<ItemGroup>
<EmbeddedResource Include="60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx" />
</ItemGroup>
<!-- ClosedXML NuGet package, pinned to 0.104.2. -->
<ItemGroup>
<PackageReference Include="ClosedXML" Version="0.104.2" />
</ItemGroup>
<!-- Reference to the sibling PoyoLang.Dictionary project. -->
<ItemGroup>
<ProjectReference Include="..\PoyoLang.Dictionary\PoyoLang.Dictionary.csproj" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,124 @@
// See https://aka.ms/new-console-template for more information
// Load up the file
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Text.Json;
using ClosedXML.Excel;
using PoyoLang.Dictionary;
// Manifest resource name of the embedded word-frequency workbook.
const string ExcelFilePath = "PoyoLang.Dictionary.Generation.60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx";

// Shared serializer settings: both output files are written as indented JSON.
JsonSerializerOptions jsonOptions = new JsonSerializerOptions() { WriteIndented = true };

Console.WriteLine("Reading Excel file...");

// Load Excel data file.
// GetManifestResourceStream returns null when no resource carries this exact name;
// fail with a descriptive error rather than handing null to XLWorkbook.
await using var excelFileStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(ExcelFilePath)
    ?? throw new InvalidOperationException($"Embedded resource '{ExcelFilePath}' was not found in the executing assembly.");
using var workBook = new XLWorkbook(excelFileStream);
var worksheet = workBook.Worksheet("List");

Console.WriteLine("Reading word frequencies");

// Read word frequencies: words are read from column C, their counts from column D.
var wordColumn = "C";
var frequencyColumn = "D";
var wordFrequencies = new List<(string word, long frequency)>();

// Reading starts at row 2 (row 1 is presumably a header row - confirm against the workbook)
// and stops at the first blank cell in the word column.
for (var row = 2; ; row++)
{
    var wordValue = worksheet.Cell(row, wordColumn).Value;
    if (wordValue.IsBlank)
    {
        break;
    }

    // NOTE(review): GetText() is expected to throw when a cell is not stored as text
    // (e.g. a boolean or a number) - confirm every entry in the word column is a text cell.
    // A trailing space is appended to each word, so the space takes part in the n-grams below.
    var word = $"{wordValue.GetText().Trim().ToLowerInvariant()} ";
    var frequency = (long)worksheet.Cell(row, frequencyColumn).GetDouble();

    // Entries containing an opening parenthesis are left out.
    if (!word.Contains('('))
    {
        wordFrequencies.Add((word, frequency));
    }
}

Console.WriteLine("Computing ngrams");

// Compute n-grams: every substring of MinLength..MaxLength characters, at every start
// position of every word, accumulates that word's frequency.
var ngrams = new Dictionary<string, long>();
const int MaxLength = 8;
const int MinLength = 1;

foreach (var (word, frequency) in wordFrequencies)
{
    for (var start = 0; word.Length - start >= MinLength; start++)
    {
        // Never read past the end of the word.
        var longest = Math.Min(MaxLength, word.Length - start);
        for (var length = MinLength; length <= longest; length++)
        {
            var ngram = word.Substring(start, length);
            // GetValueOrDefault yields 0 for an n-gram that has not been seen yet.
            ngrams[ngram] = ngrams.GetValueOrDefault(ngram) + frequency;
        }
    }
}

// Order frequencies, most frequent n-gram first.
var orderedNgrams = ngrams
    .OrderByDescending(n => n.Value)
    .ToList();

Console.WriteLine($"Found {orderedNgrams.Count} n-grams");

var serializedNgrams = JsonSerializer.Serialize(orderedNgrams, jsonOptions);
await File.WriteAllTextAsync("n-grams.json", serializedNgrams);

Console.WriteLine("Generating dictionary...");

// Generate dictionary: the i-th alphabet entry is mapped to the i-th most frequent n-gram.
var alphabet = Alphabet.BaseAlphabet;

// Every alphabet entry needs its own n-gram; report a shortage explicitly instead of
// failing with an index-out-of-range error in the middle of the loop.
if (orderedNgrams.Count < alphabet.Length)
{
    throw new InvalidOperationException(
        $"Only {orderedNgrams.Count} n-grams were found, but {alphabet.Length} are required to fill the alphabet.");
}

var dictionary = new Dictionary<string, string>(alphabet.Length);
for (var index = 0; index < alphabet.Length; index++)
{
    dictionary[alphabet[index]] = orderedNgrams[index].Key;
}

await File.WriteAllTextAsync("dictionary.json", JsonSerializer.Serialize(dictionary, jsonOptions));

// The relative file name above resolves against the current working directory.
Console.WriteLine($"Dictionary written to {Path.Combine(Environment.CurrentDirectory, "dictionary.json")}");

View File

@@ -0,0 +1,44 @@
namespace PoyoLang.Dictionary;
/// <summary>
/// Character tables and the derived list of four-character "p?y?" entries.
/// </summary>
public static class Alphabet
{
    /// <summary>
    /// The plain letter 'o' followed by seven accented forms of it.
    /// </summary>
    public static char[] OVariations { get; } = ['o', 'ó', 'ò', 'ô', 'ö', 'õ', 'ō', 'ǒ'];

    /// <summary>
    /// Eight forms each of the vowels o, a, i, u and e, in that order (40 characters).
    /// </summary>
    public static char[] VowelVariations { get; } =
    [
        // o
        'o',
        'ó',
        'ò',
        'ô',
        'ö',
        'õ',
        'ō',
        'ǒ',
        // a
        'a',
        'á',
        'à',
        'â',
        'ä',
        'ã',
        'ā',
        'ǎ',
        // i
        'i',
        'í',
        'ì',
        'î',
        'ï',
        'ĩ',
        'ī',
        'ǐ',
        // u
        'u',
        'ú',
        'ù',
        'û',
        'ü',
        'ũ',
        'ū',
        'ǔ',
        // e
        'e',
        'é',
        'è',
        'ê',
        'ë',
        'ẽ',
        'ē',
        'ě'
    ];

    /// <summary>
    /// Every combination "p{left}y{right}" where left runs over <see cref="VowelVariations"/>
    /// (outer, slower-changing position) and right runs over <see cref="OVariations"/>
    /// (inner, faster-changing position).
    /// </summary>
    public static string[] BaseAlphabet { get; } =
    (
        from first in VowelVariations
        from second in OVariations
        select $"p{first}y{second}"
    ).ToArray();
}

View File

@@ -0,0 +1,9 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- No OutputType is set, so the SDK default applies (a class library). Targets .NET 9 with implicit usings and nullable reference types enabled. -->
<PropertyGroup>
<TargetFramework>net9.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
</Project>

View File

@@ -0,0 +1,14 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Console executable targeting .NET 9 with implicit usings and nullable reference types enabled. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net9.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<!-- Reference to the sibling PoyoLang.Dictionary project. -->
<ItemGroup>
<ProjectReference Include="..\PoyoLang.Dictionary\PoyoLang.Dictionary.csproj" />
</ItemGroup>
</Project>

9
PoyoLang.Test/Program.cs Normal file
View File

@@ -0,0 +1,9 @@
// See https://aka.ms/new-console-template for more information
using PoyoLang.Dictionary;
// Switch the console to UTF-8 so the accented alphabet characters can be written out.
Console.OutputEncoding = System.Text.Encoding.UTF8;

// Print every alphabet entry on its own line, followed by the number of entries.
var entries = Alphabet.BaseAlphabet;
var listing = string.Join(Environment.NewLine, entries);
Console.WriteLine(listing);
Console.WriteLine(entries.Length);

28
PoyoLang.sln Normal file
View File

@@ -0,0 +1,28 @@

Microsoft Visual Studio Solution File, Format Version 12.00
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PoyoLang.Dictionary", "PoyoLang.Dictionary\PoyoLang.Dictionary.csproj", "{2D875AAD-BE17-4D15-A876-19DF1DCC57F5}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PoyoLang.Test", "PoyoLang.Test\PoyoLang.Test.csproj", "{4CB193B2-44F2-4926-A56E-9A0CDCBC828C}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PoyoLang.Dictionary.Generation", "PoyoLang.Dictionary.Generation\PoyoLang.Dictionary.Generation.csproj", "{43FFCEF2-A4AA-49A1-9731-CB6DAD9863F2}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{2D875AAD-BE17-4D15-A876-19DF1DCC57F5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{2D875AAD-BE17-4D15-A876-19DF1DCC57F5}.Debug|Any CPU.Build.0 = Debug|Any CPU
{2D875AAD-BE17-4D15-A876-19DF1DCC57F5}.Release|Any CPU.ActiveCfg = Release|Any CPU
{2D875AAD-BE17-4D15-A876-19DF1DCC57F5}.Release|Any CPU.Build.0 = Release|Any CPU
{4CB193B2-44F2-4926-A56E-9A0CDCBC828C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{4CB193B2-44F2-4926-A56E-9A0CDCBC828C}.Debug|Any CPU.Build.0 = Debug|Any CPU
{4CB193B2-44F2-4926-A56E-9A0CDCBC828C}.Release|Any CPU.ActiveCfg = Release|Any CPU
{4CB193B2-44F2-4926-A56E-9A0CDCBC828C}.Release|Any CPU.Build.0 = Release|Any CPU
{43FFCEF2-A4AA-49A1-9731-CB6DAD9863F2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{43FFCEF2-A4AA-49A1-9731-CB6DAD9863F2}.Debug|Any CPU.Build.0 = Debug|Any CPU
{43FFCEF2-A4AA-49A1-9731-CB6DAD9863F2}.Release|Any CPU.ActiveCfg = Release|Any CPU
{43FFCEF2-A4AA-49A1-9731-CB6DAD9863F2}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
EndGlobal