word-counts-categorical

0.0.165 • Public • Published

word-counts-categorical

Get word counts / frequencies on a per-speaker or per-category basis, or as an aggregate.

Predict the most likely category for a string using Bayes' rule.

Optionally stems words using stemmer (stemming is enabled by default).

Table of Contents

Installation

npm i word-counts-categorical

Usage

var wcc = require('word-counts-categorical');

var stuffBobSays = "my name is bob. i like pizza.";
var stuffJaneSays = "my name is jane. i like snowboarding.";

var wordCountAggregator = {};
wcc.countWordsAsCategory(stuffBobSays, "BOB", wordCountAggregator);
wcc.countWordsAsCategory(stuffJaneSays, "JANE", wordCountAggregator);

//you can continue adding words per-category...
//wcc.countWordsAsCategory(moreStuffBobSays, "BOB", wordCountAggregator);
//wcc.countWordsAsCategory(moreStuffJaneSays, "JANE", wordCountAggregator);

console.log(wcc.getWordCountsOverall(wordCountAggregator));
// {
//     my: 2,
//     name: 2,
//     is: 2,
//     bob: 1,
//     i: 2,
//     like: 2,
//     pizza: 1,
//     jane: 1,
//     snowboard: 1
// }


console.log(wcc.getWordFrequenciesOverall(wordCountAggregator));
// {
//     my: 0.14285714285714285,
//     name: 0.14285714285714285,
//     is: 0.14285714285714285,
//     bob: 0.07142857142857142,
//     i: 0.14285714285714285,
//     like: 0.14285714285714285,
//     pizza: 0.07142857142857142,
//     jane: 0.07142857142857142,
//     snowboard: 0.07142857142857142
// }


console.log(wcc.getWordFrequenciesForAllCategories(wordCountAggregator));
// {
//     BOB: {
//         my: 0.14285714285714285,
//         name: 0.14285714285714285,
//         is: 0.14285714285714285,
//         bob: 0.14285714285714285,
//         i: 0.14285714285714285,
//         like: 0.14285714285714285,
//         pizza: 0.14285714285714285
//     },
//     JANE: {
//         my: 0.14285714285714285,
//         name: 0.14285714285714285,
//         is: 0.14285714285714285,
//         jane: 0.14285714285714285,
//         i: 0.14285714285714285,
//         like: 0.14285714285714285,
//         snowboard: 0.14285714285714285
//     }
// }


console.log(wcc.getWordFrequenciesForCategory(wordCountAggregator, "BOB"));
//{
//     my: 0.14285714285714285,
//     name: 0.14285714285714285,
//     is: 0.14285714285714285,
//     bob: 0.14285714285714285,
//     i: 0.14285714285714285,
//     like: 0.14285714285714285,
//     pizza: 0.14285714285714285
// }

console.log(wcc.getTotalWordsOverall(wordCountAggregator));
//14
console.log(wcc.getTotalWordsForCategory(wordCountAggregator,"JANE"));
//7


//words shared between categories...
console.log(wcc.getSharedWords(wordCountAggregator));
//[ 'my', 'name', 'is', 'i', 'like' ]

//filter out (delete) words whose word count is below the cutoff
wcc.filterForMinimumWordCount(wordCountAggregator,2)

Usage - no stemming

//with stemming disabled...
var wordCountAggregator_noStems = {};
var doStem = false; //doStem param is true by default
wcc.countWordsAsCategory(stuffBobSays, "BOB", wordCountAggregator_noStems, doStem);
wcc.countWordsAsCategory(stuffJaneSays, "JANE", wordCountAggregator_noStems, doStem);

//results now include "snowboarding", not "snowboard"

Usage - Bayesian prediction

//bayes probability ...
var doStem = true;
console.log(wcc.getMostLikelyCategory("snowboarding is cool", wordCountAggregator, doStem)); //doStem is optional, defaults to true
//JANE

//get all category bayesian log-probabilities ...
console.log(wcc.getProbabilityForAllCategories("snowboarding is cool", wordCountAggregator, doStem));
//[ [ 'BOB', -8.0507033814703 ], [ 'JANE', -7.9171719888457766 ] ]

//for single category ...
console.log(wcc.getProbabilityForStringInCategory("snowboarding is cool", wordCountAggregator,"BOB", doStem));
//-8.0507033814703

Usage - disable Laplace smoothing

//disable +1 laplace smoothing...
wcc.enableLaPlaceSmooth(false);
console.log(wcc.getProbabilityForStringInCategory("snowboarding is fun", wordCountAggregator,"BOB", doStem));
//result is now -7.783640596221254 [result was -8.0507033814703 with smoothing enabled]

About

Built by MarketerRank.

See Also

bayes - a similar tool, and the source of the Laplace-smoothed Bayes algorithm

Package Sidebar

Install

npm i word-counts-categorical

Weekly Downloads

2

Version

0.0.165

License

MIT

Unpacked Size

18.7 kB

Total Files

4

Last publish

Collaborators

  • marketerrank