Jeff's Barney's #1 Barney's #2
array
1 course,
2 horse
3 horse,
4 smog
5 course.
6 magma
7 glasses
8 cannot
9 silly
10 mouse
array
1 horse
2 course
3 smog
4 mouse
5 helicopter
6 talk
7 keyboard
8 cannot
9 magma
10 silly
query
  OCCURRENCES WORD
1 4 horse
2 3 course
3 2 smog
4 1 mouse
5 1 helicopter
6 1 talk
7 1 keyboard
8 1 cannot
9 1 magma
10 1 silly

<!---
	Demo page: runs three word-histogram implementations (defined further down in this
	template) over the same text and dumps their results side by side.

	attributes.text is given a sample sentence when it is not already defined.
	NOTE(review): the form below posts a field named "text"; nothing in this file copies
	form fields into "attributes", so this presumably relies on a surrounding framework
	or custom-tag context doing that - confirm.
--->
<cfparam name="attributes.text" default="A horse is a horse, of course, of course, and you cannot talk to a horse, of course.  smog liquid magma smog silly horse helicopter keyboard mouse glasses vodka." />

<!--- Comma-delimited list of words that are passed to each implementation as the words to exclude. --->
<cfset ignoreList = "all,another,any,anybody,anyone,anything,both,each,either,everybody,everyone,everything,few,he,her,hers,herself,him,himself,his,I,it,its,itself,little,many,me,mine,more,most,much,myself,neither,no,one,nobody,none,nothing,one,one another,other,others,ours,ourselves,several,she,some,somebody,someone,something,that,theirs,them,themselves,these,they,this,those,us,we,what,whatever,which,whichever,who,whoever,whom,whomever,whose,you,yours,yourself,yourselves,,a,the,to,are,of,can,is,but,have,that,want,What,my,an,for,all,out,and,look,very,need,get,case" />

<!--- Each implementation gets the same text, the same ignore list and a limit of 10 words. --->
<cfset hb1 = getHistogram_barney1(attributes.text, ignoreList, 10) />
<cfset hb2 = getHistogram_barney2(attributes.text, ignoreList, 10) />
<cfset hj = getHistogram_jeff(attributes.text, ignoreList, 10) />

<!---
	Output: a form to submit new text, a table dumping the three results, and finally
	the HTML-escaped source of this very template.
--->
<cfoutput>
<style type="text/css">
td { vertical-align: top }
</style>
<form method="post" action="?">
<textarea name="text" style="width:650px;height:100px;">#htmlEditFormat(attributes.text)#</textarea><br />
<input type="submit" value="build histogram" />
</form>

<table>
<tr>
	<th>Jeff's</th>
	<th>Barney's ##1</th>
	<th>Barney's ##2</th>
</tr>
<tr>
	<td><cfdump var="#hj#" /></td>
	<td><cfdump var="#hb1#" /></td>
	<td><cfdump var="#hb2#" /></td>
</tr>
</table>
<hr />
<pre>#htmlEditFormat(fileRead(getCurrentTemplatePath()))#</pre>
</cfoutput>

<cffunction name="getHistogram_barney1" hint="Returns the most frequent words of a text, most frequent first, leaving out ignored words">
	<cfargument name="text" hint="Text to analyse; every run of non-letter characters counts as a word separator" />
	<cfargument name="ignoreList" hint="Comma-delimited list of words to leave out (matched case-insensitively)" />
	<cfargument name="maxItems" hint="Maximum number of words to return" />
	<cfset var list = "" />
	<cfset var lookup = {} />
	<cfset var word = "" />
	<!---
		The words are lower-cased before counting, so the ignore words are lower-cased too:
		HashSet.contains() is case-sensitive, and entries such as "I" or "What" would
		otherwise never match anything.
	--->
	<cfset var ignores = createObject("java", "java.util.HashSet").init(listToArray(lCase(arguments.ignoreList))) />
	<!--- Normalised copy of the input: letters only, single-space separated, lower case.
	      Kept in its own local instead of overwriting the "text" argument. --->
	<cfset var words = lCase(REReplace(arguments.text, "[^a-zA-Z]+", " ", "all")) />

	<!--- Count every word that is not on the ignore list. --->
	<cfloop list="#words#" index="word" delimiters=" ">
		<!--- c'mon, where's CFCONTINUE? --->
		<cfif NOT ignores.contains(word)>
			<cfif structKeyExists(lookup, word)>
				<cfset lookup[word] += 1 />
			<cfelse>
				<cfset lookup[word] = 1 />
			</cfif>
		</cfif>
	</cfloop>

	<!--- structSort returns the keys (words) ordered by their counts, highest first. --->
	<cfset list = structSort(lookup, "numeric", "desc") />
	<cfif arrayLen(list) GT arguments.maxItems>
		<!--- java.util.List.subList: zero-based, end index exclusive. --->
		<cfset list = list.subList(0, arguments.maxItems) />
	</cfif>
	<cfreturn list />
</cffunction>


<cffunction name="getHistogram_barney2" hint="Returns a query (word, occurrences) of the most frequent words of a text, most frequent first, leaving out ignored words">
	<cfargument name="text" hint="Text to analyse; every run of non-letter characters counts as a word separator" />
	<cfargument name="ignoreList" hint="Comma-delimited list of words to leave out (matched case-insensitively)" />
	<cfargument name="maxItems" hint="Maximum number of rows to return" />
	<cfset var result = "" />
	<cfset var lookup = {} />
	<cfset var list = "" />
	<cfset var word = "" />
	<!---
		The words are lower-cased before counting, so the ignore words are lower-cased too:
		HashSet.contains() is case-sensitive, and entries such as "I" or "What" would
		otherwise never match anything.
	--->
	<cfset var ignores = createObject("java", "java.util.HashSet").init(listToArray(lCase(arguments.ignoreList))) />
	<!--- Normalised copy of the input: letters only, single-space separated, lower case.
	      Kept in its own local instead of overwriting the "text" argument. --->
	<cfset var words = lCase(REReplace(arguments.text, "[^a-zA-Z]+", " ", "all")) />

	<!--- Count every word that is not on the ignore list. --->
	<cfloop list="#words#" index="word" delimiters=" ">
		<!--- c'mon, where's CFCONTINUE? --->
		<cfif NOT ignores.contains(word)>
			<cfif structKeyExists(lookup, word)>
				<cfset lookup[word] += 1 />
			<cfelse>
				<cfset lookup[word] = 1 />
			</cfif>
		</cfif>
	</cfloop>

	<!--- structSort returns the keys (words) ordered by their counts, highest first. --->
	<cfset list = structSort(lookup, "numeric", "desc") />
	<cfif arrayLen(list) GT arguments.maxItems>
		<!--- java.util.List.subList: zero-based, end index exclusive. --->
		<cfset list = list.subList(0, arguments.maxItems) />
	</cfif>

	<!--- version 1 returned 'list'; here a query is built that also carries the counts --->
	<cfset result = queryNew("word,occurrences", "varchar,integer") />
	<cfloop array="#list#" index="word">
		<cfset queryAddRow(result) />
		<cfset querySetCell(result, "word", word) />
		<cfset querySetCell(result, "occurrences", lookup[word]) />
	</cfloop>
	<cfreturn result />
</cffunction>


<cffunction name="getHistogram_jeff" returntype="array" hint="Creates a histogram of words: returns the words of a text ordered by how often they occur, most frequent first">
    <cfargument name="sourceText" required="true" hint="The string of text we want to generate a histogram for" type="string" />
    <cfargument name="ignoreList" required="false" hint="comma delimited list of words to ignore" type="string" />
    <cfargument name="histogramLength" required="false" hint="number of words that we want to send back, i.e. only the top 5" type="string" />

    <cfset var histogramCount = structNew() /> <!--- word -> number of occurrences --->
    <cfset var sortedHistogram = "" />  <!--- words sorted by occurrences, descending --->
    <cfset var x = "" /> <!--- iterator over the ignore list --->
    <cfset var i = "" /> <!--- iterator over the words --->
    <!--- Both of these used to be unscoped and therefore leaked into the shared variables scope. --->
    <cfset var useNum = 0 /> <!--- first array index that gets cut off --->
    <cfset var y = 0 /> <!--- iterator for trimming the result --->

    <!---
        Loop through all of the text, assuming that a space separates a word.
        Punctuation is not stripped, so "course" and "course," count as different words.
    --->
    <cfloop delimiters=" " list="#arguments.sourceText#" index="i">
        <cfif structKeyExists(histogramCount, i)>
            <!--- seen before: increase its count by 1 --->
            <cfset histogramCount[i] = histogramCount[i] + 1 />
        <cfelse>
            <!--- first occurrence: make a new key in the struct for this word --->
            <cfset histogramCount[i] = 1 />
        </cfif>
    </cfloop>

    <!--- Do we have an ignore list? --->
    <cfif structKeyExists(arguments, "ignoreList") and len(trim(arguments.ignoreList))>
        <!---
            Remove every ignored word from the counts. Entries are trimmed so that
            "a, the" works like "a,the"; structDelete is a no-op for missing keys.
        --->
        <cfloop delimiters="," list="#arguments.ignoreList#" index="x">
            <cfset structDelete(histogramCount, trim(x)) />
        </cfloop>
    </cfif>

    <!--- Sort the histogram based on most occurrences of a given word --->
    <cfset sortedHistogram = structSort(histogramCount, "numeric", "DESC") />

    <!--- see if we need to only show x number of words for this histogram --->
    <cfif structKeyExists(arguments, "histogramLength") and len(trim(arguments.histogramLength))>
        <!--- Clamp at 0 so a negative length yields an empty result instead of deleting index 0. --->
        <cfset useNum = max(arguments.histogramLength, 0) + 1 />
        <!--- Delete from the end; the loop body never runs when the array is already short enough. --->
        <cfloop index="y" from="#arrayLen(sortedHistogram)#" to="#useNum#" step="-1">
            <cfset arrayDeleteAt(sortedHistogram, y) />
        </cfloop>
    </cfif>

    <cfreturn sortedHistogram />

</cffunction>