Files
PoyoLang/PoyoLang.Dictionary.Generation/Program.cs
2025-05-13 21:07:53 +02:00

124 lines
2.9 KiB
C#

// See https://aka.ms/new-console-template for more information
// Load up the file
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Text.Json;
using ClosedXML.Excel;
using PoyoLang.Dictionary;
// Manifest resource name of the workbook embedded in this assembly.
const string ExcelFilePath = "PoyoLang.Dictionary.Generation.60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx";
// Serializer settings used when writing the JSON output files (indented for readability).
JsonSerializerOptions jsonOptions = new JsonSerializerOptions() { WriteIndented = true };
Console.WriteLine("Reading Excel file...");
// Load Excel data file
// GetManifestResourceStream returns null (it does not throw) when no resource has the requested
// name, so fail fast here with a message listing what is actually embedded instead of handing
// a null stream to the workbook constructor.
var generatorAssembly = Assembly.GetExecutingAssembly();
await using var excelFileStream = generatorAssembly.GetManifestResourceStream(ExcelFilePath)
    ?? throw new InvalidOperationException(
        $"Embedded resource '{ExcelFilePath}' was not found. Available resources: {string.Join(", ", generatorAssembly.GetManifestResourceNames())}");
using var workBook = new XLWorkbook(excelFileStream);
// The word list is read from the sheet named "List".
var worksheet = workBook.Worksheet("List");
Console.WriteLine("Reading word frequencies");
// Collect (word, frequency) pairs from the worksheet.
// Words are taken from column C and their frequencies from column D.
const string WordColumn = "C";
const string FrequencyColumn = "D";
var wordFrequencies = new List<(string word, long frequency)>();
// Scanning starts at row 2 and stops at the first row whose word cell is blank.
for (var rowNumber = 2; ; rowNumber++)
{
    var cellValue = worksheet.Cell(rowNumber, WordColumn).Value;
    if (cellValue.IsBlank)
    {
        break;
    }

    // Normalize the word (trimmed, lower-cased) and append a single trailing space.
    var normalized = cellValue.GetText().Trim().ToLowerInvariant() + " ";
    // The frequency cell is read as a double and truncated to a long.
    var occurrences = (long)worksheet.Cell(rowNumber, FrequencyColumn).GetDouble();

    // Entries containing an opening parenthesis are left out of the list.
    if (normalized.Contains('('))
    {
        continue;
    }

    wordFrequencies.Add((normalized, occurrences));
}
Console.WriteLine("Computing ngrams");
// Accumulate, for every substring of MinLength..MaxLength characters found in any word,
// the summed frequency of the words that contain it (counted once per occurrence).
var ngrams = new Dictionary<string, long>();
const int MaxLength = 8;
const int MinLength = 1;
foreach (var (word, frequency) in wordFrequencies)
{
    // Slide the start position across the word while at least MinLength characters remain.
    for (var start = 0; word.Length - start >= MinLength; start++)
    {
        // An n-gram cannot run past the end of the word.
        var longest = Math.Min(MaxLength, word.Length - start);
        for (var length = MinLength; length <= longest; length++)
        {
            var ngram = word.Substring(start, length);
            // A missing key reads as 0, so the first occurrence stores the word's frequency as-is.
            ngrams[ngram] = ngrams.GetValueOrDefault(ngram) + frequency;
        }
    }
}
// Rank the n-grams, highest accumulated frequency first.
var orderedNgrams =
    (from entry in ngrams
     orderby entry.Value descending
     select entry).ToList();
Console.WriteLine($"Found {orderedNgrams.Count} n-grams");
// Write the ranked list out as JSON.
await File.WriteAllTextAsync(
    "n-grams.json",
    JsonSerializer.Serialize(orderedNgrams, jsonOptions));
Console.WriteLine("Generating dictionary...");
// Generate dictionary
// Each letter of the base alphabet, in enumeration order, is mapped to the next n-gram of the
// ranked list, so the first letter receives the most frequent n-gram.
const string DictionaryFileName = "dictionary.json";
var dictionary = new Dictionary<string, string>();
var ngramIndex = 0;
foreach (var letter in Alphabet.BaseAlphabet)
{
    // Without this guard a too-short n-gram list surfaces as a bare ArgumentOutOfRangeException
    // from the list indexer; report what actually went wrong instead.
    if (ngramIndex >= orderedNgrams.Count)
    {
        throw new InvalidOperationException(
            $"Only {orderedNgrams.Count} n-grams are available, which is not enough to assign one to every letter of the alphabet.");
    }

    dictionary[letter] = orderedNgrams[ngramIndex].Key;
    ngramIndex++;
}
await File.WriteAllTextAsync(DictionaryFileName, JsonSerializer.Serialize(dictionary, jsonOptions));
Console.WriteLine($"Dictionary written to {Path.Combine(Environment.CurrentDirectory, DictionaryFileName)}");