// See https://aka.ms/new-console-template for more information
//
// Dictionary generation tool.
//
// Reads a word/frequency list from an Excel workbook embedded in this assembly,
// counts the frequency-weighted occurrences of every character n-gram
// (MinLength..MaxLength characters), writes the ranked n-grams to "n-grams.json",
// then assigns the highest-ranked n-grams to the entries of Alphabet.BaseAlphabet
// (in enumeration order) and writes that mapping to "dictionary.json" and, in a
// simple "key=value" line format, to "dictionary.txt".
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Text.Json;
using ClosedXML.Excel;
using PoyoLang.Dictionary;

const string ExcelFilePath = "PoyoLang.Dictionary.Generation.60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx";

// Layout of the "List" worksheet: row 1 is skipped, data starts on row 2.
const string WordColumn = "C";
const string FrequencyColumn = "D";
const int FirstDataRow = 2;

// Inclusive bounds on the n-gram length, in characters.
const int MinLength = 1;
const int MaxLength = 8;

var jsonOptions = new JsonSerializerOptions { WriteIndented = true };

Console.WriteLine("Reading Excel file...");

// Load Excel data file.
// GetManifestResourceStream returns null when no resource has that exact name,
// so fail here with a message that names the resource instead of letting the
// workbook constructor fail on a null stream.
await using var excelFileStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(ExcelFilePath)
    ?? throw new InvalidOperationException(
        $"Embedded resource '{ExcelFilePath}' was not found in the executing assembly.");
using var workBook = new XLWorkbook(excelFileStream);
var worksheet = workBook.Worksheet("List");

Console.WriteLine("Reading word frequencies");

// Read word frequencies: walk down the word column until the first blank cell.
var wordFrequencies = new List<(string word, long frequency)>();
for (var row = FirstDataRow; ; row++)
{
    var wordValue = worksheet.Cell(row, WordColumn).Value;
    if (wordValue.IsBlank)
    {
        break;
    }

    // Normalize the word and append a single space, so the n-grams below also
    // cover the character following the word (presumably to mark the word boundary).
    var word = $"{wordValue.GetText().Trim().ToLowerInvariant()} ";

    // Entries containing '(' are left out of the statistics.
    if (word.Contains('('))
    {
        continue;
    }

    // The frequency cell is read as a double and truncated to a whole count.
    var frequency = (long)worksheet.Cell(row, FrequencyColumn).GetDouble();
    wordFrequencies.Add((word, frequency));
}

Console.WriteLine("Computing ngrams");

// Compute n-grams: for every start position in a word, every substring of
// MinLength..MaxLength characters that fits contributes the word's frequency
// to that substring's running total.
var ngrams = new Dictionary<string, long>();
foreach (var (word, frequency) in wordFrequencies)
{
    for (var start = 0; word.Length - start >= MinLength; start++)
    {
        var longest = Math.Min(MaxLength, word.Length - start);
        for (var length = MinLength; length <= longest; length++)
        {
            var ngram = word.Substring(start, length);
            ngrams[ngram] = ngrams.GetValueOrDefault(ngram) + frequency;
        }
    }
}

// Order frequencies, most frequent first. OrderByDescending is a stable sort,
// so n-grams with equal totals keep the dictionary's enumeration order.
var orderedNgrams = ngrams
    .OrderByDescending(n => n.Value)
    .ToList();

Console.WriteLine($"Found {orderedNgrams.Count} n-grams");

var serializedNgrams = JsonSerializer.Serialize(orderedNgrams, jsonOptions);
await File.WriteAllTextAsync("n-grams.json", serializedNgrams);

Console.WriteLine("Generating dictionary...");

// Generate dictionary: the i-th alphabet entry maps to the i-th ranked n-gram.
var dictionary = BuildDictionary(Alphabet.BaseAlphabet, orderedNgrams);

await File.WriteAllTextAsync("dictionary.json", JsonSerializer.Serialize(dictionary, jsonOptions));

Console.WriteLine($"Dictionary written to {Path.Combine(Environment.CurrentDirectory, "dictionary.json")}");

// Also write in simple custom format for source generator.
// The writer is scoped so the file is flushed and closed before the message below.
await using (var customOutput = File.CreateText("dictionary.txt"))
{
    foreach (var pair in dictionary)
    {
        await customOutput.WriteLineAsync($"{pair.Key}={pair.Value}");
    }
}

Console.WriteLine($"Custom dictionary written to {Path.Combine(Environment.CurrentDirectory, "dictionary.txt")}");

// Pairs each alphabet entry, in enumeration order, with the n-gram at the same
// rank. Generic so the key type follows whatever element type
// Alphabet.BaseAlphabet exposes. Throws InvalidOperationException when there
// are fewer ranked n-grams than alphabet entries.
static Dictionary<TLetter, string> BuildDictionary<TLetter>(
    IEnumerable<TLetter> alphabet,
    IReadOnlyList<KeyValuePair<string, long>> rankedNgrams)
    where TLetter : notnull
{
    var result = new Dictionary<TLetter, string>();
    var ngramIndex = 0;
    foreach (var letter in alphabet)
    {
        if (ngramIndex >= rankedNgrams.Count)
        {
            throw new InvalidOperationException(
                $"Only {rankedNgrams.Count} n-grams are available, which is not enough to cover the alphabet.");
        }

        result[letter] = rankedNgrams[ngramIndex].Key;
        ngramIndex++;
    }

    return result;
}