// See https://aka.ms/new-console-template for more information
//
// Reads a word/frequency list from an Excel workbook embedded in this assembly,
// accumulates frequency-weighted counts for every character n-gram of 1 to 5
// characters found in those words, and writes the n-grams ordered by descending
// count to "n-grams.json" in the current working directory.

using System.Reflection;
using System.Runtime.CompilerServices;
using System.Text.Json;
using ClosedXML.Excel;

Console.WriteLine("Reading Excel file...");

// Load Excel data file (stored as a manifest resource of the executing assembly)
const string ExcelResourceName = "PoyoLang.Analysis.NGrams.60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx";

// GetManifestResourceStream returns null when no resource has this exact name;
// fail here with the offending name instead of letting XLWorkbook receive null.
using var excelFileStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(ExcelResourceName)
    ?? throw new FileNotFoundException(
        $"Embedded resource '{ExcelResourceName}' was not found in the executing assembly.",
        ExcelResourceName);

using var workBook = new XLWorkbook(excelFileStream);
var worksheet = workBook.Worksheet("List");

Console.WriteLine("Reading word frequencies");

// Read word frequencies
const string WordColumn = "C";
const string FrequencyColumn = "D";

// NOTE(review): reading starts at row 2; row 1 is presumably a header row - confirm.
const int FirstDataRow = 2;

var wordFrequencies = new List<(string word, long frequency)>();

// Walk down the sheet until the first blank cell in the word column.
for (var row = FirstDataRow; ; row++)
{
    var wordValue = worksheet.Cell(row, WordColumn).Value;

    if (wordValue.IsBlank)
    {
        break;
    }

    // The word is trimmed, lower-cased and given one trailing space, so that space
    // is part of the n-grams (presumably as an end-of-word marker - confirm).
    var word = $"{wordValue.GetText().Trim().ToLowerInvariant()} ";

    // The frequency cell is read as a double and truncated to an integer.
    var frequency = (long)worksheet.Cell(row, FrequencyColumn).GetDouble();

    // Entries containing '(' are left out of the analysis.
    // NOTE(review): presumably annotated variants in the source list - confirm.
    if (!word.Contains('('))
    {
        wordFrequencies.Add((word, frequency));
    }
}

Console.WriteLine("Computing ngrams");

// Compute n-grams
var ngrams = new Dictionary<string, long>();

const int MaxLength = 5;
const int MinLength = 1;

foreach (var (word, frequency) in wordFrequencies)
{
    var remaining = word.AsSpan();

    // Slide the start position through the word. At each position, count every
    // prefix of the remaining text whose length is between MinLength and MaxLength.
    while (remaining.Length >= MinLength)
    {
        var longest = Math.Min(MaxLength, remaining.Length);

        for (var length = MinLength; length <= longest; length++)
        {
            Increment(remaining[..length], frequency);
        }

        remaining = remaining[1..];
    }
}

// Order frequencies
var orderedNgrams = ngrams
    .OrderByDescending(n => n.Value)
    .ToList();

Console.WriteLine($"Found {orderedNgrams.Count} n-grams");

var serializedNgrams = JsonSerializer.Serialize(orderedNgrams, new JsonSerializerOptions { WriteIndented = true });

await File.WriteAllTextAsync("n-grams.json", serializedNgrams);

// Adds `frequency` to the running total stored in `ngrams` for the n-gram `span`,
// creating the entry (starting from zero) the first time the n-gram is seen.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
void Increment(ReadOnlySpan<char> span, long frequency)
{
    var ngram = span.ToString();

    ngrams[ngram] = ngrams.GetValueOrDefault(ngram) + frequency;
}