Add dictionary generation
This commit is contained in:
105
PoyoLang.Analysis.NGrams/Program.cs
Normal file
105
PoyoLang.Analysis.NGrams/Program.cs
Normal file
@@ -0,0 +1,105 @@
|
||||
// See https://aka.ms/new-console-template for more information
|
||||
|
||||
// Load up the file
|
||||
|
||||
using System.Reflection;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Text.Json;
|
||||
using ClosedXML.Excel;
|
||||
|
||||
Console.WriteLine("Reading Excel file...");

// Load the Excel data file. It is read as a manifest resource of the executing assembly,
// so the constant below is a resource name rather than a path on disk.
const string ExcelFilePath = "PoyoLang.Analysis.NGrams.60000 Most Frequent Words in American English - Corpus of Contemporary American English.xlsx";

// GetManifestResourceStream returns null (it does not throw) when no resource with this
// name exists, so fail fast with a message that names the missing resource instead of
// handing a null stream to the workbook constructor.
using var excelFileStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(ExcelFilePath)
    ?? throw new FileNotFoundException(
        $"Embedded resource '{ExcelFilePath}' was not found in the executing assembly.",
        ExcelFilePath);

using var workBook = new XLWorkbook(excelFileStream);

// The word list is read from the sheet named "List".
var worksheet = workBook.Worksheet("List");
|
||||
|
||||
Console.WriteLine("Reading word frequencies");

// Read word frequencies: words are taken from column C, their counts from column D.
var wordColumn = "C";
var frequencyColumn = "D";

var wordFrequencies = new List<(string word, long frequency)>();

// Reading starts at row 2 (row 1 is presumably a header row - confirm against the workbook)
// and stops at the first row whose word cell is blank.
var row = 2;
var wordValue = worksheet.Cell(row, wordColumn).Value;

while (!wordValue.IsBlank)
{
    // Trim and lower-case the word, then append a single trailing space so the space
    // becomes part of the stored text (presumably as a word-boundary marker - confirm).
    // NOTE(review): GetText() appears to require a text-typed cell - confirm the sheet
    // has no word cells that Excel stored as booleans or numbers.
    var normalizedWord = $"{wordValue.GetText().Trim().ToLowerInvariant()} ";

    // The frequency cell is read as a double and truncated to a whole number.
    var occurrences = (long)worksheet.Cell(row, frequencyColumn).GetDouble();

    // Entries containing an opening parenthesis are left out of the list.
    var hasParenthesis = normalizedWord.Contains('(');
    if (!hasParenthesis)
    {
        wordFrequencies.Add((normalizedWord, occurrences));
    }

    row++;
    wordValue = worksheet.Cell(row, wordColumn).Value;
}
|
||||
|
||||
Console.WriteLine("Computing ngrams");

// Compute n-grams: every substring of MinLength..MaxLength characters of every word is
// tallied, and each occurrence adds the frequency of the word it was found in.
var ngrams = new Dictionary<string, long>();

const int MaxLength = 5;
const int MinLength = 1;

foreach (var (word, frequency) in wordFrequencies)
{
    // Move the start position through the word for as long as at least MinLength
    // characters remain after it.
    for (var start = 0; word.Length - start >= MinLength; start++)
    {
        // Never read past the end of the word: cap the n-gram size at the remaining length.
        var longest = Math.Min(MaxLength, word.Length - start);

        for (var size = MinLength; size <= longest; size++)
        {
            var ngram = word.Substring(start, size);

            // First sighting stores the word's frequency; later sightings add to the total.
            ngrams[ngram] = ngrams.TryGetValue(ngram, out var total)
                ? total + frequency
                : frequency;
        }
    }
}
|
||||
|
||||
// Rank the n-grams from the highest accumulated frequency to the lowest.
// The sort is stable, so n-grams with equal totals keep the order in which
// the dictionary enumerates them.
var orderedNgrams = (
    from item in ngrams
    orderby item.Value descending
    select item).ToList();

Console.WriteLine($"Found {orderedNgrams.Count} n-grams");

// Write the ranked list as indented JSON to "n-grams.json" (a relative path, so it is
// resolved against the process's current directory).
var jsonOptions = new JsonSerializerOptions { WriteIndented = true };
var serializedNgrams = JsonSerializer.Serialize(orderedNgrams, jsonOptions);

await File.WriteAllTextAsync("n-grams.json", serializedNgrams);
|
||||
|
||||
Reference in New Issue
Block a user