// See https://aka.ms/new-console-template for more information
//
// Reads a word-frequency list from an Excel workbook embedded in this assembly,
// accumulates frequency-weighted counts for every character n-gram of length
// MinLength..MaxLength, and writes the n-grams, ordered by descending count,
// to "n-grams.json" in the current working directory.

using System.Reflection;
using System.Runtime.CompilerServices;
using System.Text.Json;
using ClosedXML.Excel;

Console.WriteLine("Reading Excel file...");

// Load Excel data file (embedded as a manifest resource of this assembly).
const string ExcelFilePath = "PoyoLang.Analysis.NGrams.60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx";

// GetManifestResourceStream returns null when no resource has this exact name;
// fail with a descriptive message instead of handing null to XLWorkbook.
using var excelFileStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(ExcelFilePath)
    ?? throw new InvalidOperationException($"Embedded resource '{ExcelFilePath}' was not found in the executing assembly.");

using var workBook = new XLWorkbook(excelFileStream);
var worksheet = workBook.Worksheet("List");

Console.WriteLine("Reading word frequencies");

// Read word frequencies: words come from column C and frequencies from column D,
// starting at row 2 and stopping at the first blank word cell.
var wordColumn = "C";
var frequencyColumn = "D";

var wordFrequencies = new List<(string word, long frequency)>();

var row = 2;
while (true)
{
    var wordValue = worksheet.Cell(row, wordColumn).Value;

    if (wordValue.IsBlank)
    {
        break;
    }

    // Normalise the word and append a single space, so n-grams taken at the
    // end of a word include that trailing space.
    var word = $"{wordValue.GetText().Trim().ToLowerInvariant()} ";

    // Entries containing '(' are skipped entirely.
    if (!word.Contains('('))
    {
        var frequency = (long)worksheet.Cell(row, frequencyColumn).GetDouble();
        wordFrequencies.Add((word, frequency));
    }

    row++;
}

Console.WriteLine("Computing ngrams");

// Compute n-grams: maps each n-gram to the summed frequency of all its occurrences.
var ngrams = new Dictionary<string, long>();

const int MaxLength = 5;
const int MinLength = 1;

foreach (var (word, frequency) in wordFrequencies)
{
    var span = word.AsSpan();

    // Slide the start position through the word; at each start, count every
    // prefix of the remaining text whose length is within [MinLength, MaxLength].
    while (span.Length >= MinLength)
    {
        for (int length = MinLength; length <= MaxLength; length++)
        {
            if (length > span.Length)
            {
                break;
            }

            Increment(span[..length]);
        }

        span = span[1..];
    }

    continue;

    // Adds the current word's frequency to the running total of the given n-gram.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    void Increment(ReadOnlySpan<char> span)
    {
        var ngram = span.ToString();

        if (ngrams.TryGetValue(ngram, out var count))
        {
            ngrams[ngram] = count + frequency;
        }
        else
        {
            ngrams[ngram] = frequency;
        }
    }
}

// Order frequencies, highest total first.
var orderedNgrams = ngrams
    .OrderByDescending(n => n.Value)
    .ToList();

Console.WriteLine($"Found {orderedNgrams.Count} n-grams");

// Serialised as an indented JSON array of Key/Value pairs.
var serializedNgrams = JsonSerializer.Serialize(orderedNgrams, new JsonSerializerOptions() { WriteIndented = true });

await File.WriteAllTextAsync("n-grams.json", serializedNgrams);