Add dictionary generation
This commit is contained in:
124
PoyoLang.Dictionary.Generation/Program.cs
Normal file
124
PoyoLang.Dictionary.Generation/Program.cs
Normal file
@@ -0,0 +1,124 @@
|
||||
// See https://aka.ms/new-console-template for more information
|
||||
|
||||
// Load up the file
|
||||
|
||||
using System.Reflection;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Text.Json;
|
||||
using ClosedXML.Excel;
|
||||
using PoyoLang.Dictionary;
|
||||
|
||||
// Manifest resource name of the embedded Excel workbook that holds the word list.
const string ExcelFilePath = "PoyoLang.Dictionary.Generation.60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx";

// Shared serializer settings: indented output for the generated JSON files.
JsonSerializerOptions jsonOptions = new JsonSerializerOptions() { WriteIndented = true };

Console.WriteLine("Reading Excel file...");

// Load Excel data file.
// GetManifestResourceStream returns null when no resource with that name is embedded,
// so fail here with a clear message instead of handing a null stream to the workbook.
await using var excelFileStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(ExcelFilePath)
    ?? throw new InvalidOperationException($"Embedded resource '{ExcelFilePath}' was not found in the executing assembly.");

using var workBook = new XLWorkbook(excelFileStream);

// The data is read from the worksheet named "List".
var worksheet = workBook.Worksheet("List");
|
||||
|
||||
Console.WriteLine("Reading word frequencies");

// Read word frequencies: words come from column C, their frequencies from column D.
var wordColumn = "C";
var frequencyColumn = "D";

var wordFrequencies = new List<(string word, long frequency)>();

// Start at row 2 (presumably row 1 is a header - confirm against the workbook)
// and walk down until the first blank word cell.
for (var rowNumber = 2; ; rowNumber++)
{
    var wordCell = worksheet.Cell(rowNumber, wordColumn).Value;

    if (wordCell.IsBlank)
    {
        break;
    }

    // Normalise the word and append a single trailing space to it.
    var normalized = wordCell.GetText().Trim().ToLowerInvariant();
    var word = normalized + " ";

    // The frequency cell is read as a double and truncated to a whole number.
    var frequency = (long)worksheet.Cell(rowNumber, frequencyColumn).GetDouble();

    // Entries containing an opening parenthesis are left out of the list.
    if (word.Contains('('))
    {
        continue;
    }

    wordFrequencies.Add((word, frequency));
}
|
||||
|
||||
Console.WriteLine("Computing ngrams");

// Compute n-grams: every substring of MinLength..MaxLength characters of each word
// is counted, weighted by that word's frequency.
var ngrams = new Dictionary<string, long>();

const int MaxLength = 8;
const int MinLength = 1;

foreach (var (word, frequency) in wordFrequencies)
{
    // Slide the start position across the word while at least MinLength characters remain.
    for (var start = 0; word.Length - start >= MinLength; start++)
    {
        // Longest n-gram that fits at this position, capped at MaxLength.
        var longest = Math.Min(MaxLength, word.Length - start);

        for (var length = MinLength; length <= longest; length++)
        {
            var ngram = word.Substring(start, length);

            // Add this word's frequency to the running total for the n-gram.
            ngrams[ngram] = ngrams.TryGetValue(ngram, out var total)
                ? total + frequency
                : frequency;
        }
    }
}
|
||||
|
||||
// Rank the n-grams from highest to lowest accumulated frequency.
var byDescendingFrequency = ngrams.OrderByDescending(entry => entry.Value);
var orderedNgrams = byDescendingFrequency.ToList();

Console.WriteLine($"Found {orderedNgrams.Count} n-grams");

// Write the ranked list as JSON to n-grams.json (relative path).
await File.WriteAllTextAsync("n-grams.json", JsonSerializer.Serialize(orderedNgrams, jsonOptions));
|
||||
|
||||
Console.WriteLine("Generating dictionary...");

// Generate dictionary: the i-th entry of Alphabet.BaseAlphabet is mapped to the
// i-th most frequent n-gram.
const string DictionaryFileName = "dictionary.json";

var dictionary = new Dictionary<string, string>();
var ngramIndex = 0;

foreach (var letter in Alphabet.BaseAlphabet)
{
    // Without this guard, running out of n-grams (e.g. an empty worksheet) surfaces
    // as an unexplained ArgumentOutOfRangeException from the list indexer.
    if (ngramIndex >= orderedNgrams.Count)
    {
        throw new InvalidOperationException(
            $"Only {orderedNgrams.Count} n-grams are available, which is not enough to cover the base alphabet.");
    }

    dictionary[letter] = orderedNgrams[ngramIndex].Key;

    ngramIndex++;
}

// The file is written relative to the current working directory.
await File.WriteAllTextAsync(DictionaryFileName, JsonSerializer.Serialize(dictionary, jsonOptions));

Console.WriteLine($"Dictionary written to {Path.Combine(Environment.CurrentDirectory, DictionaryFileName)}");
|
||||
Reference in New Issue
Block a user