word-counts-categorical
Get word counts / frequencies on a per-speaker or per-category basis, or as an aggregate.
Predict the most likely category for a string using Bayes' rule.
Optionally stems words using stemmer.
Table of Contents
- Installation
- Usage
- Usage - no stemming
- Usage - Bayesian prediction
- Usage - disable LaPlace smoothing
- About
- See Also
Installation
npm i word-counts-categorical
Usage
var wcc = require('word-counts-categorical');
var stuffBobSays = "my name is bob. i like pizza.";
var stuffJaneSays = "my name is jane. i like snowboarding.";
var wordCountAggregator = {};
wcc.countWordsAsCategory(stuffBobSays, "BOB", wordCountAggregator);
wcc.countWordsAsCategory(stuffJaneSays, "JANE", wordCountAggregator);
//you can continue adding words per-category...
//wcc.countWordsAsCategory(moreStuffBobSays, "BOB", wordCountAggregator);
//wcc.countWordsAsCategory(moreStuffJaneSays, "JANE", wordCountAggregator);
console.log(wcc.getWordCountsOverall(wordCountAggregator));
// {
// my: 2,
// name: 2,
// is: 2,
// bob: 1,
// i: 2,
// like: 2,
// pizza: 1,
// jane: 1,
// snowboard: 1
// }
console.log(wcc.getWordFrequenciesOverall(wordCountAggregator));
// {
// my: 0.14285714285714285,
// name: 0.14285714285714285,
// is: 0.14285714285714285,
// bob: 0.07142857142857142,
// i: 0.14285714285714285,
// like: 0.14285714285714285,
// pizza: 0.07142857142857142,
// jane: 0.07142857142857142,
// snowboard: 0.07142857142857142
// }
console.log(wcc.getWordFrequenciesForAllCategories(wordCountAggregator));
// {
// BOB: {
// my: 0.14285714285714285,
// name: 0.14285714285714285,
// is: 0.14285714285714285,
// bob: 0.14285714285714285,
// i: 0.14285714285714285,
// like: 0.14285714285714285,
// pizza: 0.14285714285714285
// },
// JANE: {
// my: 0.14285714285714285,
// name: 0.14285714285714285,
// is: 0.14285714285714285,
// jane: 0.14285714285714285,
// i: 0.14285714285714285,
// like: 0.14285714285714285,
// snowboard: 0.14285714285714285
// }
// }
console.log(wcc.getWordFrequenciesForCategory(wordCountAggregator, "BOB"));
//{
// my: 0.14285714285714285,
// name: 0.14285714285714285,
// is: 0.14285714285714285,
// bob: 0.14285714285714285,
// i: 0.14285714285714285,
// like: 0.14285714285714285,
// pizza: 0.14285714285714285
// }
console.log(wcc.getTotalWordsOverall(wordCountAggregator));
//14
console.log(wcc.getTotalWordsForCategory(wordCountAggregator,"JANE"));
//7
//words shared between categories...
console.log(wcc.getSharedWords(wordCountAggregator));
//[ 'my', 'name', 'is', 'i', 'like' ]
//remove (delete) words whose word count is below the cutoff
wcc.filterForMinimumWordCount(wordCountAggregator,2)
Usage - no stemming
//with stemming disabled...
var wordCountAggregator_noStems = {};
var doStem = false; //doStem param is true by default
wcc.countWordsAsCategory(stuffBobSays, "BOB", wordCountAggregator_noStems, doStem);
wcc.countWordsAsCategory(stuffJaneSays, "JANE", wordCountAggregator_noStems, doStem);
//results now include "snowboarding" not "snowboard"
Usage - Bayesian prediction
//bayes probability ...
var doStem = true;
console.log(wcc.getMostLikelyCategory("snowboarding is cool", wordCountAggregator, doStem)); //doStem is optional, defaults to true
//JANE
//get all category bayesian log-probabilities ...
console.log(wcc.getProbabilityForAllCategories("snowboarding is cool", wordCountAggregator, doStem));
//[ [ 'BOB', -8.0507033814703 ], [ 'JANE', -7.9171719888457766 ] ]
//for single category ...
console.log(wcc.getProbabilityForStringInCategory("snowboarding is cool", wordCountAggregator,"BOB", doStem));
//-8.0507033814703
Usage - disable LaPlace smoothing
//disable +1 laplace smoothing...
wcc.enableLaPlaceSmooth(false);
console.log(wcc.getProbabilityForStringInCategory("snowboarding is fun", wordCountAggregator,"BOB", doStem));
//result is now -7.783640596221254 [the earlier call with "snowboarding is cool" returned -8.0507033814703 with smoothing enabled]
About
Built by MarketerRank.
See Also
bayes - a similar tool, and the source of the Laplace-smoothed Bayes algorithm