Enforce that all one-grams are present in the dictionary to make sure it's possible to write any sentence
140 lines
3.6 KiB
C#
140 lines
3.6 KiB
C#
// See https://aka.ms/new-console-template for more information
|
|
|
|
// Load up the file
|
|
|
|
using System.Reflection;
|
|
using System.Runtime.CompilerServices;
|
|
using System.Text.Json;
|
|
using ClosedXML.Excel;
|
|
using PoyoLang.Dictionary;
|
|
|
|
// Manifest resource name of the Excel workbook embedded in this assembly
const string ExcelFilePath = "PoyoLang.Dictionary.Generation.60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx";

// Shared serializer settings: indented JSON output
JsonSerializerOptions jsonOptions = new JsonSerializerOptions() { WriteIndented = true };

Console.WriteLine("Reading Excel file...");

// Load Excel data file
// GetManifestResourceStream returns null (it does not throw) when no embedded resource
// matches the name, so fail fast with a descriptive error instead of handing a null
// stream to the workbook constructor.
await using var excelFileStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(ExcelFilePath)
    ?? throw new InvalidOperationException(
        $"Embedded resource '{ExcelFilePath}' was not found in the executing assembly.");

using var workBook = new XLWorkbook(excelFileStream);
var worksheet = workBook.Worksheet("List");
|
|
|
|
Console.WriteLine("Reading word frequencies");

// Read word frequencies
// Column letters of the worksheet cells read for each row
const string WordColumn = "C";
const string FrequencyColumn = "D";

// Scanning starts at row 2 (row 1 is presumably a header row; confirm against the workbook)
const int FirstDataRow = 2;

var wordFrequencies = new List<(string word, long frequency)>();

for (var rowNumber = FirstDataRow; ; rowNumber++)
{
    var wordCell = worksheet.Cell(rowNumber, WordColumn).Value;

    // The first blank word cell ends the scan
    if (wordCell.IsBlank)
    {
        break;
    }

    // Words are trimmed, lower-cased and stored with a single trailing space
    var normalizedWord = wordCell.GetText().Trim().ToLowerInvariant() + " ";
    var occurrences = (long)worksheet.Cell(rowNumber, FrequencyColumn).GetDouble();

    // Entries containing an opening parenthesis are left out of the list
    if (normalizedWord.Contains('('))
    {
        continue;
    }

    wordFrequencies.Add((normalizedWord, occurrences));
}
|
|
|
|
Console.WriteLine("Computing ngrams");

// Compute n-grams
// Maps every substring of MinLength..MaxLength characters to the summed frequency
// of the words it occurs in (once per occurrence position).
var ngrams = new Dictionary<string, long>();

const int MaxLength = 8;
const int MinLength = 2;

foreach (var (word, frequency) in wordFrequencies)
{
    // Slide the start position along the word while at least MinLength characters remain
    for (var start = 0; word.Length - start >= MinLength; start++)
    {
        // Longest n-gram that still fits between the start position and the end of the word
        var longest = Math.Min(MaxLength, word.Length - start);

        // Shortest first, so new keys enter the dictionary in the same order as before
        for (var length = MinLength; length <= longest; length++)
        {
            var ngram = word.Substring(start, length);

            // A missing key counts as zero, so this both creates and accumulates
            ngrams[ngram] = ngrams.GetValueOrDefault(ngram) + frequency;
        }
    }
}
|
|
|
|
// Order frequencies
// Rank n-grams from the highest accumulated frequency to the lowest
var orderedNgrams = (
    from entry in ngrams
    orderby entry.Value descending
    select entry
).ToList();

Console.WriteLine($"Found {orderedNgrams.Count} n-grams");

// Persist the full ranking as indented JSON in the working directory
await File.WriteAllTextAsync("n-grams.json", JsonSerializer.Serialize(orderedNgrams, jsonOptions));
|
|
|
|
Console.WriteLine("Generating dictionary...");

// Generate dictionary
// Candidate values in assignment order: the space and the 26 lowercase letters come first
// to make sure all words are writeable, followed by the n-grams in ranked order.
string[] fullNgrams =
[
    " ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
    .. orderedNgrams.Select(ranked => ranked.Key)
];

var dictionary = new Dictionary<string, string>();
var nextCandidate = 0;

// Pair each entry of the base alphabet with the next candidate, in order
foreach (var letter in Alphabet.BaseAlphabet)
{
    dictionary[letter] = fullNgrams[nextCandidate++];
}
|
|
|
|
// Write the dictionary as indented JSON in the working directory
var dictionaryJson = JsonSerializer.Serialize(dictionary, jsonOptions);
await File.WriteAllTextAsync("dictionary.json", dictionaryJson);

var jsonLocation = Path.Combine(Environment.CurrentDirectory, "dictionary.json");
Console.WriteLine($"Dictionary written to {jsonLocation}");

// Also write in simple custom format for source generator
// One "key=value" line per dictionary entry
await using var customOutput = File.CreateText("dictionary.txt");

foreach (var (key, value) in dictionary)
{
    await customOutput.WriteLineAsync($"{key}={value}");
}

var textLocation = Path.Combine(Environment.CurrentDirectory, "dictionary.txt");
Console.WriteLine($"Custom dictionary written to {textLocation}");