From e99cfeee8d7b69adeb9dcd87b733eb2c926a0037 Mon Sep 17 00:00:00 2001 From: spydaz Date: Mon, 21 Aug 2023 14:22:59 +0300 Subject: [PATCH] Updated --- SourceCode/Embeddings.vb | 32 +- SourceCode/Entailment.vb | 291 +- SourceCode/EntityModels.vb | 247 +- SourceCode/Examples.vb | 132 +- SourceCode/InputModels.vb | 1974 ++----------- SourceCode/LanguageModels.vb | 1969 +------------ SourceCode/Storage.vb | 8 +- SourceCode/Tokenizers.vb | 1378 +++++++++ SourceCode/Trees.vb | 20 +- SourceCode/Utilitys.vb | 4649 +++++++++++++++--------------- SpydazWebAI_InputModeller.vbproj | 1 + 11 files changed, 4637 insertions(+), 6064 deletions(-) create mode 100644 SourceCode/Tokenizers.vb diff --git a/SourceCode/Embeddings.vb b/SourceCode/Embeddings.vb index 3f107d8..4ad5821 100644 --- a/SourceCode/Embeddings.vb +++ b/SourceCode/Embeddings.vb @@ -13,7 +13,7 @@ Imports MathNet.Numerics.IntegralTransforms Namespace Models Namespace Embeddings - + Public MustInherit Class WordEmbeddingsModel ' A simple vocabulary for demonstration purposes. @@ -424,6 +424,7 @@ Namespace Models ''' This Step enables you To refine the word embeddings ''' And make them more accurate And context-specific. ''' + Public Class HybridWordEmbeddingsModel Inherits WordEmbeddingsModel @@ -583,6 +584,7 @@ Namespace Models ''' ''' This is a TFIDF Vectorizer For basic Embeddings ''' + Public Class Sentence2Vector Private ReadOnly documents As List(Of List(Of String)) Private ReadOnly idf As Dictionary(Of String, Double) @@ -648,15 +650,15 @@ Namespace Models End Class Public Class Word2Vector - Private embeddingMatrix As Double(,) - Private embeddingSize As Integer - Private indexToWord As Dictionary(Of Integer, String) - Private learningRate As Double - Private negativeSamples As Integer - Private vocabulary As HashSet(Of String) - Private weights As Double() - Private windowSize As Integer - Private wordToIndex As Dictionary(Of String, Integer) + Public embeddingMatrix As Double(,) + Public embeddingSize As Integer + Public indexToWord As Dictionary(Of Integer, String) + Public learningRate As Double + Public negativeSamples As Integer + Public vocabulary As HashSet(Of String) + Public weights As Double() + Public windowSize As Integer + Public wordToIndex As Dictionary(Of String, Integer) Public Sub New(embeddingSize As Integer, learningRate As Double, windowSize As Integer, negativeSamples As Integer) Me.embeddingSize = embeddingSize @@ -1034,6 +1036,7 @@ Namespace Models ''' May sacrifice quality: With negative sampling, some negative samples may Not be truly informative, potentially leading To a slight degradation In the quality Of learned word embeddings compared To hierarchical softmax. ''' Tuning hyperparameters: The effectiveness Of negative sampling depends On the selection Of the number Of negative samples And learning rate, which may require tuning. ''' + Public Class WordEmbeddingsWithNegativeSampling Inherits WordEmbeddingsModel Public NumNegativeSamples As Integer = 5 ' Number of negative samples per positive sample. @@ -1223,6 +1226,7 @@ Namespace Models ''' Computationally expensive For large vocabularies: Hierarchical softmax can become computationally expensive With larger vocabularies, As it requires traversing a binary tree To compute probabilities For Each word during training. ''' More complex To implement: Implementing hierarchical softmax can be more complex compared To negative sampling. ''' + Public Class WordEmbeddingsWithHierarchicalSoftmax Inherits WordEmbeddingsModel Public Sub New(ByRef model As WordEmbeddingsModel) @@ -1465,6 +1469,7 @@ Namespace Models End Class End Class + Public Class WordEmbeddingsWithGloVe Inherits WordEmbeddingsModel @@ -1601,6 +1606,7 @@ Namespace Models End Sub End Class + Public Class WordEmbeddingsWithFastText Inherits WordEmbeddingsModel @@ -1714,6 +1720,7 @@ Namespace Models Next End Sub End Class + Public Class WordEmbeddingsWithCBOW Inherits WordEmbeddingsModel @@ -1854,6 +1861,7 @@ Namespace Models Next End Sub End Class + Public Class WordEmbeddingWithTemplate Inherits WordEmbeddingsModel @@ -1873,6 +1881,7 @@ Namespace Models Throw New NotImplementedException() End Sub End Class + Public Class WordEmbeddingWithSentiment Inherits WordEmbeddingsModel @@ -2102,6 +2111,7 @@ Namespace Models End Function End Class + Public Class WordEmbeddingWithTfIdf Inherits WordEmbeddingsModel @@ -2268,6 +2278,7 @@ Namespace Models End Namespace Namespace Audio + Public Class Audio2Vector Public Shared Function AudioToVector(audioSignal As Double(), windowSize As Integer, hopSize As Integer) As List(Of Complex()) Dim vectors As New List(Of Complex()) @@ -2382,6 +2393,7 @@ Namespace Models End Class End Namespace Namespace Images + Public Class Image2Vector Public Shared Sub SaveVectorToFile(imgVector As Double(), outputPath As String) Using writer As New System.IO.StreamWriter(outputPath) diff --git a/SourceCode/Entailment.vb b/SourceCode/Entailment.vb index d65e983..f28d293 100644 --- a/SourceCode/Entailment.vb +++ b/SourceCode/Entailment.vb @@ -5,23 +5,27 @@ Imports InputModelling.Models.Readers Namespace Models Namespace Entailment + Public Structure CapturedType Public Property Sentence As String Public Property LogicalRelation_ As String Public Property SubType As String End Structure + Public Structure ClassifiedSentence Public Classified As Boolean Public Type As String Public Entity As CapturedType End Structure + Public Structure ClassificationRule Public Property Type As String Public Property Subtype As String Public Property Relationship As String Public Property Patterns As List(Of Regex) End Structure + Public Class EntailmentClassifier Public conclusionIndicators As String() = {"therefore", "thus", "consequently", "hence", "in conclusion", "Therefore", "Thus", "As a result"} Public hypothesisIndicators As String() = {"if", "when", "suppose that", "let's say", "assuming that", "Suppose", "Assume", "In the case that", "Given that"} @@ -277,6 +281,7 @@ Namespace Models End Function End Class + Public Class LogicalDependencyClassifier Private Shared ReadOnly CauseAndEffectPattern As Regex = New Regex("(?i)(cause|effect|result in|lead to|because|due to|consequently)") Private Shared ReadOnly ComparisonPattern As Regex = New Regex("(?i)(compared to|greater than|less than|similar to|different from|between)") @@ -371,6 +376,7 @@ Namespace Models End Function End Class + Public Class SentenceClassifier @@ -2202,7 +2208,7 @@ Namespace Models Public Shared Function GetSentence(sentence As String) As CapturedType Dim lowercaseSentence As String = sentence.ToLower() Dim newType As New CapturedType With { - .sentence = lowercaseSentence, + .Sentence = lowercaseSentence, .LogicalRelation_ = LogicalDependencyClassifier.ClassifyLogicalDependency(lowercaseSentence) } @@ -2301,6 +2307,7 @@ Namespace Models End Class End Class + Public Class LogicalArgumentClassifier Private Shared ClassificationRules As List(Of ClassificationRule) @@ -2426,6 +2433,7 @@ Namespace Models Console.ReadLine() End Sub End Class + Public Class ContextAnalyzer Public Class StatementGroup ''' @@ -2552,6 +2560,7 @@ Namespace Models Return statementGroups End Function End Class + Public Class PronounResolver Public Function ResolvePronoun(sentence As String, pronoun As String) As String ' Tokenize the sentence into words @@ -2683,6 +2692,286 @@ Namespace Models Return False End Function End Class + ''' + ''' Grammatical person refers to the degree of involvement of a participant in an action, event, or circumstance. + ''' There are three degrees of grammatical person: + ''' first person (the speaker), + ''' second person (someone being spoken to), + ''' and third person (anyone/anything not being directly addressed). + ''' + + Public Class GramaticalPerson + Private Shared FirstPersonProNouns As List(Of String) = New List(Of String)({" I ", " ME ", " MY", " MINE", " MYSELF", "I ", " US", " OUR", " OURS"}) + Private Shared SecondPersonProNouns As List(Of String) = New List(Of String)({" YOU ", " YOUR ", " YOURSELF ", " YOURSELFS", " YOURSELVES"}) + Private Shared ThirdPersonProNouns As List(Of String) = New List(Of String)({"he", "him", " his", " himself", " she", " her", " hers", " herself", " it", " its", " itself", " they", "them", "their", "theirs", "themselves"}) + + ''' + ''' Grammatical person refers to the degree of involvement of a participant in an action, event, or circumstance. + ''' There are three degrees of grammatical person: + ''' first person (the speaker), + ''' second person (someone being spoken to), + ''' and third person (anyone/anything not being directly addressed). + ''' + Public Enum PerspectivePerson + First_Person_ME = 0 + Second_Person_YOU = 1 + Third_Person_THEM = 2 + NOBODY = 4 + End Enum + + ''' + ''' The cases Of pronouns tell you how they are being used In a sentence. + ''' + Public Enum PerspectiveCase + PersonalSubject = 0 + PersonalObject = 1 + PersonalPosessive = 2 + NOBODY = 3 + End Enum + +#Region "Perspective Person" + + ''' + ''' checks list if it contains item + ''' + ''' + ''' + ''' + Private Shared Function DETECT_PERSPECTIVE(ByRef UserSentence As String, Lst As List(Of String)) As Boolean + DETECT_PERSPECTIVE = False + For Each item In Lst + If UCase(UserSentence).Contains(UCase(item)) Then Return True + Next + End Function + + ''' + ''' RETURNS THE SUBJECT PERSPECTIVE + ''' ''' IE: + ''' "ME" - 1ST PERSON - + ''' "YOU" - SECOND PERSON - + ''' "THEM" - 3RD PERSON - + ''' "NOBODY" NO PERSPECTIVE + ''' + ''' + ''' + Public Shared Function GetGramiticalPersonStr(ByRef UserInputStr As String) As PerspectivePerson + If DETECT_1ST_PERSON(UserInputStr) = True Then Return PerspectivePerson.First_Person_ME + If DETECT_2ND_PERSON(UserInputStr) = True Then Return PerspectivePerson.Second_Person_YOU + If DETECT_3RD_PERSON(UserInputStr) = True Then Return PerspectivePerson.Third_Person_THEM + Return PerspectivePerson.NOBODY + End Function + + ''' + ''' First person definition: first person indicates the speaker. + ''' First person is the I/we perspective. + ''' We, us, our,and ourselves are all first-person pronouns. + ''' Specifically, they are plural first-person pronouns. + ''' Singular first-person pronouns include I, me, my, mine and myself. + ''' + ''' + Public Shared Function DETECT_1ST_PERSON(ByRef UserSentence As String) As Boolean + DETECT_1ST_PERSON = False + If DETECT_PERSPECTIVE(UserSentence, FirstPersonProNouns) = True Then Return True + End Function + + ''' + ''' Second person definition: second person indicates the addressee. + ''' Second person is the you perspective. + ''' The second-person point of view belongs to the person (or people) being addressed. + ''' This is the “you” perspective. + ''' the biggest indicator of the second person is the use of second-person pronouns: + ''' you, your, yours, yourself, yourselves. + ''' + ''' + Public Shared Function DETECT_2ND_PERSON(ByRef UserSentence As String) As Boolean + DETECT_2ND_PERSON = False + If DETECT_PERSPECTIVE(UserSentence, SecondPersonProNouns) = True Then Return True + End Function + + ''' + ''' Third person definition: third person indicates a third party individual other than the speaker. + ''' Third person is the he/she/it/they perspective. + ''' The third-person point of view belongs to the person (or people) being talked about. + ''' The third-person pronouns include + ''' he, him, his, himself, she, her, hers, herself, it, its, itself, they, them, their, theirs, and themselves. + ''' + ''' + Public Shared Function DETECT_3RD_PERSON(ByRef UserSentence As String) As Boolean + DETECT_3RD_PERSON = False + If DETECT_PERSPECTIVE(UserSentence, ThirdPersonProNouns) = True Then Return True + End Function + + ''' + ''' Returns detected Pronoun indicator + ''' + ''' + ''' + Public Shared Function GetDetectedPersonalProNoun(ByRef Userinput As String) As String + Dim lst As New List(Of String) + lst.AddRange(FirstPersonProNouns) + lst.AddRange(SecondPersonProNouns) + lst.AddRange(ThirdPersonProNouns) + For Each item In lst + If Userinput.Contains(UCase(item)) Then Return UCase(item) + Next + Return "" + End Function + +#End Region + +#Region "Case-The cases Of pronouns tell you how they are being used In a sentence." + + ''' + ''' The cases Of pronouns tell you how they are being used In a sentence. + ''' SUBJECT / OBJECT or POSSESSION - or NONE + ''' + ''' + Public Function CheckCase(ByRef Userinput As String) As PerspectiveCase + CheckCase = PerspectiveCase.NOBODY + If CheckPersonalSubject(Userinput) = True Then + Return PerspectiveCase.PersonalSubject + Else + If CheckPersonalObject(Userinput) = True Then + Return PerspectiveCase.PersonalObject + If CheckPersonalPossession(Userinput) = True Then + Return PerspectiveCase.PersonalPosessive + End If + End If + End If + + End Function + + ''' + ''' The cases Of pronouns tell you how they are being used In a sentence. + ''' + Public Shared Function CheckPersonalSubject(ByRef Userinput As String) As Boolean + Dim mFirstPersonProNouns As List(Of String) = New List(Of String)({" I", " WE"}) + Dim mSecondPersonProNouns As List(Of String) = New List(Of String)({" YOU", " US"}) + Dim mThirdPersonProNouns As List(Of String) = New List(Of String)({" HE", " SHE", " IT"}) + If DETECT_PERSPECTIVE(Userinput, mFirstPersonProNouns) = True Or + DETECT_PERSPECTIVE(Userinput, mSecondPersonProNouns) = True Or + DETECT_PERSPECTIVE(Userinput, mThirdPersonProNouns) = True Then + Return True + Else + Return False + End If + + End Function + + ''' + ''' The cases Of pronouns tell you how they are being used In a sentence. + ''' + Public Shared Function CheckPersonalObject(ByRef Userinput As String) As Boolean + Dim mFirstPersonProNouns As List(Of String) = New List(Of String)({"ME"}) + Dim mSecondPersonProNouns As List(Of String) = New List(Of String)({"YOU", "US"}) + Dim mThirdPersonProNouns As List(Of String) = New List(Of String)({" HIM", " HER", " IT"}) + If DETECT_PERSPECTIVE(Userinput, mFirstPersonProNouns) = True Or + DETECT_PERSPECTIVE(Userinput, mSecondPersonProNouns) = True Or + DETECT_PERSPECTIVE(Userinput, mThirdPersonProNouns) = True Then + Return True + Else + Return False + End If + End Function + + ''' + ''' The cases Of pronouns tell you how they are being used In a sentence. + ''' + Public Shared Function CheckPersonalPossession(ByRef Userinput As String) As Boolean + Dim mFirstPersonProNouns As List(Of String) = New List(Of String)({" MY", " MINE", " OUR", " OURS"}) + Dim mSecondPersonProNouns As List(Of String) = New List(Of String)({" YOUR", " YOURS"}) + Dim mThirdPersonProNouns As List(Of String) = New List(Of String)({" HIS", " HER", " HES", " HE IS", " HERS", " THEIR", " THEIRS"}) + If DETECT_PERSPECTIVE(Userinput, mFirstPersonProNouns) = True Or + DETECT_PERSPECTIVE(Userinput, mSecondPersonProNouns) = True Or + DETECT_PERSPECTIVE(Userinput, mThirdPersonProNouns) = True Then + Return True + Else + Return False + End If + End Function + + ''' + ''' The cases Of pronouns tell you how they are being used In a sentence. + ''' "ME" - 1ST PERSON - + ''' "YOU" - SECOND PERSON - + ''' "THEM" - 3RD PERSON - + ''' "NOBODY" NO PERSPECTIVE + ''' + Public Shared Function GetPersonalSubject(ByRef Userinput As String) As String + Dim mFirstPersonProNouns As List(Of String) = New List(Of String)({" I", " WE"}) + Dim mSecondPersonProNouns As List(Of String) = New List(Of String)({" YOU", " US"}) + Dim mThirdPersonProNouns As List(Of String) = New List(Of String)({" HE", " SHE", " IT"}) + If DETECT_PERSPECTIVE(Userinput, mFirstPersonProNouns) = True Then + Return "ME" + Else + If DETECT_PERSPECTIVE(Userinput, mSecondPersonProNouns) = True Then + Return "YOU" + Else + If DETECT_PERSPECTIVE(Userinput, mThirdPersonProNouns) = True Then + Return "THEM" + Else + Return "NOBODY" + End If + End If + End If + + End Function + + ''' + ''' The cases Of pronouns tell you how they are being used In a sentence. + ''' "ME" - 1ST PERSON - + ''' "YOU" - SECOND PERSON - + ''' "THEM" - 3RD PERSON - + ''' "NOBODY" NO PERSPECTIVE + ''' + Public Shared Function GetPersonalObject(ByRef Userinput As String) As String + Dim mFirstPersonProNouns As List(Of String) = New List(Of String)({"ME"}) + Dim mSecondPersonProNouns As List(Of String) = New List(Of String)({"YOU", "US"}) + Dim mThirdPersonProNouns As List(Of String) = New List(Of String)({" HIM", " HER", " IT"}) + If DETECT_PERSPECTIVE(Userinput, mFirstPersonProNouns) = True Then + Return "ME" + Else + If DETECT_PERSPECTIVE(Userinput, mSecondPersonProNouns) = True Then + Return "YOU" + Else + If DETECT_PERSPECTIVE(Userinput, mThirdPersonProNouns) = True Then + Return "THEM" + Else + Return "NOBODY" + End If + End If + End If + End Function + + ''' + ''' The cases Of pronouns tell you how they are being used In a sentence. + ''' "ME" - 1ST PERSON - + ''' "YOU" - SECOND PERSON - + ''' "THEM" - 3RD PERSON - + ''' "NOBODY" NO PERSPECTIVE + ''' + Public Shared Function GetPersonalPossession(ByRef Userinput As String) As String + Dim mFirstPersonProNouns As List(Of String) = New List(Of String)({" MY", " MINE", " OUR", " OURS"}) + Dim mSecondPersonProNouns As List(Of String) = New List(Of String)({" YOUR", " YOURS"}) + Dim mThirdPersonProNouns As List(Of String) = New List(Of String)({" HIS", " HER", " HES", " HE IS", " HERS", " THEIR", " THEIRS"}) + If DETECT_PERSPECTIVE(Userinput, mFirstPersonProNouns) = True Then + Return "ME" + Else + If DETECT_PERSPECTIVE(Userinput, mSecondPersonProNouns) = True Then + Return "YOU" + Else + If DETECT_PERSPECTIVE(Userinput, mThirdPersonProNouns) = True Then + Return "THEM" + Else + Return "NOBODY" + End If + End If + End If + End Function + +#End Region + + End Class End Namespace End Namespace \ No newline at end of file diff --git a/SourceCode/EntityModels.vb b/SourceCode/EntityModels.vb index 2096223..91021fe 100644 --- a/SourceCode/EntityModels.vb +++ b/SourceCode/EntityModels.vb @@ -1,5 +1,4 @@ Imports System.IO -Imports System.Runtime.CompilerServices Imports System.Text.RegularExpressions Imports System.Web.Script.Serialization Imports System.Windows.Forms @@ -388,6 +387,7 @@ Namespace Models NewEntSentence.DiscoveredEntitys = DetectEntitys(Text, Entitylist) NewEntSentence.EntitySearchPattern = CaptureEntitySentencePatterns(Text, Entitylist) NewEntSentence.EntitySentence = CaptureEntitySentences(Text, Entitylist) + Return NewEntSentence End Function Public Structure DiscoveredEntitys @@ -1239,12 +1239,14 @@ Namespace Models End Class End Class + Public Class RuleBasedEntityRecognizer Private Shared entityPatterns As Dictionary(Of String, String) ''' ''' Represents a captured word and its associated information. ''' + Public Structure CapturedWord ''' ''' The captured word. @@ -1521,6 +1523,7 @@ Namespace Models Return capturedEntities End Function End Class + Public Class EntityLoader Public EntityList As List(Of Entity) Public EntityTypes As List(Of String) @@ -1603,6 +1606,7 @@ Namespace Models End Function End Class + Public Structure DiscoveredEntity ''' @@ -1644,10 +1648,10 @@ Namespace Models End Structure - ' New property for relationships ' New structure to represent entity relationships + Public Structure EntityRelationship Public Property RelationshipType As String Public Property Sentence As String @@ -1670,7 +1674,6 @@ Namespace Models End Function End Structure - Public Structure EntityType Dim Type As String @@ -1695,8 +1698,15 @@ Namespace Models End Namespace Namespace DataObjects - - Public Structure AnswerType + Enum ConclusionTypes + Affirmative_Conclusion + Conditional_Conclusion + Negative_Conclusion + Recommendation_Conclusion + Prediction_Conclusion + End Enum + + Public Class AnswerType Public Sub New(ByVal type As String, ByVal entities As List(Of String)) Me.Type = type @@ -1705,8 +1715,8 @@ Namespace DataObjects Public Property Entities As List(Of String) Public Property Type As String - End Structure - + End Class + Public Structure CapturedContent Public Sub New(ByVal word As String, ByVal precedingWords As List(Of String), ByVal followingWords As List(Of String)) Me.Word = word @@ -1719,7 +1729,7 @@ Namespace DataObjects Public Property PrecedingWords As List(Of String) Public Property Word As String End Structure - + Public Structure CapturedWord Public Sub New(ByVal word As String, ByVal precedingWords As List(Of String), ByVal followingWords As List(Of String), ByVal person As String, ByVal location As String) Me.Word = word @@ -1767,7 +1777,7 @@ Namespace DataObjects Public Property PrecedingWords As List(Of String) Public Property Word As String End Structure - + Public Structure NlpReport Public EntityLists As List(Of Entity) @@ -1785,6 +1795,7 @@ Namespace DataObjects ''' Used to retrieve Learning Patterns ''' Learning Pattern / Nym ''' + Public Structure SemanticPattern ''' @@ -1795,7 +1806,7 @@ Namespace DataObjects ''' ''' Used to hold the connection string ''' - Public ConectionStr As String + Public ConnectionStr As String ''' ''' used to identify patterns @@ -1807,6 +1818,11 @@ Namespace DataObjects ''' Public SearchPatternStr As String + Public Sub New(ConnectionStr As String) + Me.New() + Me.ConnectionStr = ConnectionStr + End Sub + ''' ''' filters collection of patterns by nym ''' @@ -1857,6 +1873,35 @@ Namespace DataObjects End Using Return DbSubjectLst End Function + ''' + ''' Gets all Semantic Patterns From Table + ''' + ''' + ''' + Public Function GetDBSemanticPatterns(ByRef TableName As String) As List(Of SemanticPattern) + Dim DbSubjectLst As New List(Of SemanticPattern) + + Dim SQL As String = "SELECT * FROM " & TableName + Using conn = New System.Data.OleDb.OleDbConnection(ConnectionStr) + Using cmd = New System.Data.OleDb.OleDbCommand(SQL, conn) + conn.Open() + Try + Dim dr = cmd.ExecuteReader() + While dr.Read() + Dim NewKnowledge As New SemanticPattern With { + .NymStr = dr("Nym").ToString(), + .SearchPatternStr = dr("SemanticPattern").ToString() + } + DbSubjectLst.Add(NewKnowledge) + End While + Catch e As Exception + ' Do some logging or something. + MessageBox.Show("There was an error accessing your data. GetDBSemanticPatterns: " & e.ToString()) + End Try + End Using + End Using + Return DbSubjectLst + End Function ''' ''' gets semantic patterns from table based on query SQL @@ -1888,6 +1933,35 @@ Namespace DataObjects End Using Return DbSubjectLst End Function + ''' + ''' gets semantic patterns from table based on query SQL + ''' + ''' + ''' + Public Function GetDBSemanticPatternsbyQuery(ByRef Query As String) As List(Of SemanticPattern) + Dim DbSubjectLst As New List(Of SemanticPattern) + + Dim SQL As String = Query + Using conn = New System.Data.OleDb.OleDbConnection(ConnectionStr) + Using cmd = New System.Data.OleDb.OleDbCommand(SQL, conn) + conn.Open() + Try + Dim dr = cmd.ExecuteReader() + While dr.Read() + Dim NewKnowledge As New SemanticPattern With { + .NymStr = dr("Nym").ToString(), + .SearchPatternStr = dr("SemanticPattern").ToString() + } + DbSubjectLst.Add(NewKnowledge) + End While + Catch e As Exception + ' Do some logging or something. + MessageBox.Show("There was an error accessing your data. GetDBSemanticPatterns: " & e.ToString()) + End Try + End Using + End Using + Return DbSubjectLst + End Function ''' ''' gets random pattern from list @@ -1922,7 +1996,7 @@ Namespace DataObjects ''' Adds a New Semantic pattern ''' ''' - Public Function AddSemanticPattern(ByRef iConnectionStr As String, ByRef Tablename As String, ByRef NewSemanticPattern As SemanticPattern) As Boolean + Public Shared Function AddSemanticPattern(ByRef iConnectionStr As String, ByRef Tablename As String, ByRef NewSemanticPattern As SemanticPattern) As Boolean AddSemanticPattern = False If NewSemanticPattern.NymStr IsNot Nothing And NewSemanticPattern.SearchPatternStr IsNot Nothing Then @@ -1943,8 +2017,32 @@ Namespace DataObjects Else End If End Function + ''' + ''' Adds a New Semantic pattern + ''' + ''' + Public Function AddSemanticPattern(ByRef Tablename As String, ByRef NewSemanticPattern As SemanticPattern) As Boolean + AddSemanticPattern = False + If NewSemanticPattern.NymStr IsNot Nothing And NewSemanticPattern.SearchPatternStr IsNot Nothing Then + + Dim sql As String = "INSERT INTO " & Tablename & " (Nym, SemanticPattern) VALUES ('" & NewSemanticPattern.NymStr & "','" & NewSemanticPattern.SearchPatternStr & "')" - Public Function CheckIfSemanticPatternDetected(ByRef iConnectionStr As String, ByRef TableName As String, ByRef Userinput As String) As Boolean + Using conn = New System.Data.OleDb.OleDbConnection(ConnectionStr) + + Using cmd = New System.Data.OleDb.OleDbCommand(sql, conn) + conn.Open() + Try + cmd.ExecuteNonQuery() + AddSemanticPattern = True + Catch ex As Exception + MessageBox.Show("There was an error accessing your data. AddSemanticPattern: " & ex.ToString()) + End Try + End Using + End Using + Else + End If + End Function + Public Shared Function CheckIfSemanticPatternDetected(ByRef iConnectionStr As String, ByRef TableName As String, ByRef Userinput As String) As Boolean CheckIfSemanticPatternDetected = False For Each item In InsertWildcardsIntoPatterns(GetDBSemanticPatterns(iConnectionStr, TableName)) If Userinput Like item.SearchPatternStr Then @@ -1952,8 +2050,23 @@ Namespace DataObjects End If Next End Function - - Public Function GetDetectedSemanticPattern(ByRef iConnectionStr As String, ByRef TableName As String, ByRef Userinput As String) As SemanticPattern + Public Function CheckIfSemanticPatternDetected(ByRef TableName As String, ByRef Userinput As String) As Boolean + CheckIfSemanticPatternDetected = False + For Each item In InsertWildcardsIntoPatterns(GetDBSemanticPatterns(ConnectionStr, TableName)) + If Userinput Like item.SearchPatternStr Then + Return True + End If + Next + End Function + Public Function GetDetectedSemanticPattern(ByRef TableName As String, ByRef Userinput As String) As SemanticPattern + GetDetectedSemanticPattern = Nothing + For Each item In InsertWildcardsIntoPatterns(GetDBSemanticPatterns(ConnectionStr, TableName)) + If Userinput Like item.SearchPatternStr Then + Return item + End If + Next + End Function + Public Shared Function GetDetectedSemanticPattern(ByRef iConnectionStr As String, ByRef TableName As String, ByRef Userinput As String) As SemanticPattern GetDetectedSemanticPattern = Nothing For Each item In InsertWildcardsIntoPatterns(GetDBSemanticPatterns(iConnectionStr, TableName)) If Userinput Like item.SearchPatternStr Then @@ -1973,7 +2086,7 @@ Namespace DataObjects End Structure - + Public Structure WordWithContext ''' ''' Gets or sets the context words. @@ -2011,111 +2124,11 @@ Namespace DataObjects Public Property Word As String End Structure End Namespace -Public Module Ext - - ''' - ''' Writes the contents of an embedded resource embedded as Bytes to disk. - ''' - ''' Embedded resource - ''' Save to file - ''' - - Public Sub FileSave(ByVal BytesToWrite() As Byte, ByVal FileName As String) - - If IO.File.Exists(FileName) Then - IO.File.Delete(FileName) - End If - - Dim FileStream As New System.IO.FileStream(FileName, System.IO.FileMode.OpenOrCreate) - Dim BinaryWriter As New System.IO.BinaryWriter(FileStream) - - BinaryWriter.Write(BytesToWrite) - BinaryWriter.Close() - FileStream.Close() - End Sub - ''' - ''' Extracts words between based on the before and after words - ''' IE: THe cat sat on the mat (before The After The) output: cat sat on - ''' - ''' - ''' - ''' - ''' - - Public Function ExtractWordsBetween(sentence As String, beforeWord As String, afterWord As String) As List(Of String) - Dim words As New List(Of String)() - - Dim sentenceWords As String() = sentence.Split(" "c) - Dim startIndex As Integer = -1 - Dim endIndex As Integer = -1 - - ' Find the starting and ending indices of the target words - For i As Integer = 0 To sentenceWords.Length - 1 - If sentenceWords(i).Equals(beforeWord, StringComparison.OrdinalIgnoreCase) Then - startIndex = i - End If - - If sentenceWords(i).Equals(afterWord, StringComparison.OrdinalIgnoreCase) Then - endIndex = i - End If - Next - - ' Extract words between the target words - If startIndex <> -1 AndAlso endIndex <> -1 AndAlso startIndex < endIndex Then - For i As Integer = startIndex + 1 To endIndex - 1 - words.Add(sentenceWords(i)) - Next - End If - - Return words - End Function - - - Public Function StartsWithAny(str As String, values As IEnumerable(Of String)) As Boolean - For Each value As String In values - If str.StartsWith(value) Then - Return True - End If - Next - - Return False - End Function - - - Public Function StartsWithAny(ByVal input As String, ByVal values As String()) As Boolean - For Each value As String In values - If input.StartsWith(value, StringComparison.OrdinalIgnoreCase) Then - Return True - End If - Next - Return False - End Function - - - Enum ConclusionTypes - Affirmative_Conclusion - Conditional_Conclusion - Negative_Conclusion - Recommendation_Conclusion - Prediction_Conclusion - End Enum - - - Public Function ContainsAny(text As String, indicators As String()) As Boolean - For Each indicator As String In indicators - If text.Contains(indicator) Then - Return True - End If - Next - - Return False - End Function - -End Module Namespace Utilitys ' Latent Dirichlet Allocation (LDA) algorithm + Public Class Latent_Dirichlet_Allocation @@ -2126,7 +2139,7 @@ Namespace Utilitys 'End Class - + Public Class WordCount Public Property WordCount As Dictionary(Of String, Integer) diff --git a/SourceCode/Examples.vb b/SourceCode/Examples.vb index 3696e3d..b545b13 100644 --- a/SourceCode/Examples.vb +++ b/SourceCode/Examples.vb @@ -1,6 +1,8 @@ Imports System.IO Imports System.Numerics Imports System.Windows.Forms +Imports InputModelling.LanguageModels.BaseModels.LanguageModelFactory +Imports InputModelling.LanguageModels.BaseModels.LanguageModelFactory.Corpus.Vocabulary Imports InputModelling.Models Imports InputModelling.Models.Chunkers Imports InputModelling.Models.Embeddings @@ -16,13 +18,80 @@ Imports InputModelling.Models.Entailment.SentenceClassifier Imports InputModelling.Models.EntityModel Imports InputModelling.Models.Nodes Imports InputModelling.Models.Readers -Imports InputModelling.Models.Tokenizers +Imports InputModelling.Models.TokenizerModels Imports InputModelling.Models.Trees.BeliefTree Imports InputModelling.Models.VocabularyModelling Imports InputModelling.Utilitys Namespace Examples + Public Module Example + Public Function iLangModelTrainTest() As iLangModel.FeedForwardNetwork + ' Create the input and target training data + Dim inputs As New List(Of List(Of Double))() + Dim targets As New List(Of List(Of Double))() + + ' AND logic gate training data + inputs.Add(New List(Of Double)() From {0, 0}) + inputs.Add(New List(Of Double)() From {0, 1}) + inputs.Add(New List(Of Double)() From {1, 0}) + inputs.Add(New List(Of Double)() From {1, 1}) + + targets.Add(New List(Of Double)() From {0}) + targets.Add(New List(Of Double)() From {0}) + targets.Add(New List(Of Double)() From {0}) + targets.Add(New List(Of Double)() From {1}) + + ' Create a feed-forward neural network with 2 input neurons, 2 hidden neurons, and 1 output neuron + Dim network As New iLangModel.FeedForwardNetwork(inputSize:=2, hiddenSize:=2, outputSize:=1) + + ' Train the network using the training data for 100 epochs with a learning rate of 0.1 + network.Train(inputs, targets, epochs:=100, learningRate:=0.1) + + ' Test the trained network + Console.WriteLine("Testing the trained network:") + + For i As Integer = 0 To inputs.Count - 1 + Dim inputVector As List(Of Double) = inputs(i) + Dim targetVector As List(Of Double) = targets(i) + + Dim outputVector = network.Forward(inputs) + + Console.WriteLine("Input: {0}, Target: {1}, Output: {2}", String.Join(", ", inputVector), String.Join(", ", targetVector), String.Join(", ", outputVector)) + Next + + Return network + End Function + Public Sub IlangModelExample() + ' Create an instance of the FeedForwardNetwork + Dim feedForwardNN As iLangModel.FeedForwardNetwork = iLangModelTrainTest() + + ' Define the input sequence for the logical AND operation + Dim inputSequence As List(Of List(Of Double)) = New List(Of List(Of Double))() From + { + New List(Of Double)() From {0, 0}, + New List(Of Double)() From {0, 1}, + New List(Of Double)() From {1, 0}, + New List(Of Double)() From {1, 1} + } + + ' Apply the forward pass to get the predicted outputs + Dim output As List(Of List(Of Double)) = feedForwardNN.Forward(inputSequence) + + ' Display the input sequence and predicted outputs + Console.WriteLine("Input Sequence:") + For Each inputVector As List(Of Double) In inputSequence + Console.WriteLine(String.Join(", ", inputVector)) + Next + + Console.WriteLine("Predicted Outputs:") + For Each outputVector As List(Of Double) In output + Console.WriteLine(Math.Round(outputVector(0))) ' Round the output to the nearest integer (0 or 1) + Next + + Console.ReadLine() + End Sub + Public Sub Word2VectorExample() Dim stopwords As List(Of String) = New List(Of String) From {"this", "own", "to", "is", "a", "with", "on", "is", "at", "they", "and", "the", "are", "for"} @@ -170,7 +239,66 @@ Namespace Examples Console.WriteLine($" Expected Monetary Value: {optionB.ExpectedMonetaryValue}") Console.WriteLine($" Regret: {optionB.Regret}") End Sub + Public Sub iCorpusExample() + 'Create Vocabulary + Dim iCorpus As String = "the quick brown fox, jumped over the lazy dog." + Dim NewVocabulary = Corpus.Vocabulary.CreateVocabulary(iCorpus, Corpus.Vocabulary.VocabularyType.Word) + Console.WriteLine("vocabulary List: ") + Dim str As String = "" + For Each item In NewVocabulary + str &= "entry :" & item.Text & vbTab & "Value :" & item.Encoding & vbNewLine + + Next + Console.WriteLine(str) + 'Encode InputText + Dim InputText As String = iCorpus + + Dim InputLayer As New InputTextRecord + InputLayer.Text = iCorpus + Console.WriteLine("Input layer: ") + InputLayer.Encoding = Encode.Encode_Text(InputText, NewVocabulary, VocabularyType.Word) + Console.WriteLine("Input Text: " & "[" & InputLayer.Text & "]" & vbNewLine) + Console.WriteLine("Input Embedding: ") + str = "[" + For Each item In InputLayer.Encoding + str &= item & " " + Next + str &= "] " + Console.WriteLine(str) + Console.WriteLine(vbNewLine) + 'get inputs + InputLayer.blocksize = 4 + InputLayer.Inputblocks = InputTextRecord.GetBlocks(InputLayer.Encoding, InputLayer.blocksize) + Console.WriteLine("Input BlockSize: " & InputLayer.blocksize) + Console.WriteLine("Input Blocks ") + For Each lst In InputLayer.Inputblocks + + Dim block As String = "" + For Each item In lst + block &= item & " " + Next + Console.WriteLine("[" & block & "]") + Next + Console.WriteLine(vbNewLine) + Dim ofset = 1 + 'get targets(add ofset to get targets further in the future ofset < blocksize) + + InputLayer.Targetblocks = InputTextRecord.GetTargetBlocks(InputLayer.Encoding, InputLayer.blocksize) + Console.WriteLine("Target BlockSize: " & InputLayer.blocksize) + Console.WriteLine("Target ofset : " & ofset) + Console.WriteLine("Target Blocks ") + For Each lst In InputLayer.Targetblocks + + Dim block As String = "" + For Each item In lst + block &= item & " " + Next + Console.WriteLine("[" & block & "]") + Next + Console.ReadLine() + + End Sub Sub MedicalDiagnosisExample() ' Define nodes and states @@ -1303,7 +1431,7 @@ Namespace Examples Dim sentences As New List(Of String) From { "I love apples.", "Bananas are tasty."} - Dim Tokenizer As New Tokenizer + Dim Tokenizer As New Advanced For Each item In Corpus Tokenizer.Train(item, 5) Next diff --git a/SourceCode/InputModels.vb b/SourceCode/InputModels.vb index c986a29..1a840f1 100644 --- a/SourceCode/InputModels.vb +++ b/SourceCode/InputModels.vb @@ -1,13 +1,16 @@ Imports System.Drawing Imports System.IO +Imports System.Runtime.CompilerServices +Imports System.Runtime.Serialization.Formatters.Binary Imports System.Text.RegularExpressions Imports System.Web.Script.Serialization Imports System.Windows.Forms Imports InputModelling.LanguageModels Imports InputModelling.Models.EntityModel Imports InputModelling.Models.MatrixModels +Imports InputModelling.Models.Nodes Imports InputModelling.Models.Readers -Imports InputModelling.Models.Tokenizers +Imports InputModelling.Models.TokenizerModels Imports InputModelling.Models.VocabularyModelling Imports InputModelling.Utilitys Imports Newtonsoft.Json @@ -16,6 +19,7 @@ Imports Newtonsoft.Json Namespace Models Namespace Chunkers + Public Class TextCorpusChunker Implements ICorpusChunker @@ -64,6 +68,7 @@ Namespace Models ''' ''' Separates text into groups(chunks) ''' + Public Class TextChunking ''' ''' a Function In VB which takes a text And splits it Into chunks Of X length @@ -113,286 +118,6 @@ Namespace Models End Function - ''' - ''' Grammatical person refers to the degree of involvement of a participant in an action, event, or circumstance. - ''' There are three degrees of grammatical person: - ''' first person (the speaker), - ''' second person (someone being spoken to), - ''' and third person (anyone/anything not being directly addressed). - ''' - Public Class GramaticalPerson - Private Shared FirstPersonProNouns As List(Of String) = New List(Of String)({" I ", " ME ", " MY", " MINE", " MYSELF", "I ", " US", " OUR", " OURS"}) - Private Shared SecondPersonProNouns As List(Of String) = New List(Of String)({" YOU ", " YOUR ", " YOURSELF ", " YOURSELFS", " YOURSELVES"}) - Private Shared ThirdPersonProNouns As List(Of String) = New List(Of String)({"he", "him", " his", " himself", " she", " her", " hers", " herself", " it", " its", " itself", " they", "them", "their", "theirs", "themselves"}) - - ''' - ''' Grammatical person refers to the degree of involvement of a participant in an action, event, or circumstance. - ''' There are three degrees of grammatical person: - ''' first person (the speaker), - ''' second person (someone being spoken to), - ''' and third person (anyone/anything not being directly addressed). - ''' - Public Enum PerspectivePerson - First_Person_ME = 0 - Second_Person_YOU = 1 - Third_Person_THEM = 2 - NOBODY = 4 - End Enum - - ''' - ''' The cases Of pronouns tell you how they are being used In a sentence. - ''' - Public Enum PerspectiveCase - PersonalSubject = 0 - PersonalObject = 1 - PersonalPosessive = 2 - NOBODY = 3 - End Enum - -#Region "Perspective Person" - - ''' - ''' checks list if it contains item - ''' - ''' - ''' - ''' - Private Shared Function DETECT_PERSPECTIVE(ByRef UserSentence As String, Lst As List(Of String)) As Boolean - DETECT_PERSPECTIVE = False - For Each item In Lst - If UCase(UserSentence).Contains(UCase(item)) Then Return True - Next - End Function - - ''' - ''' RETURNS THE SUBJECT PERSPECTIVE - ''' ''' IE: - ''' "ME" - 1ST PERSON - - ''' "YOU" - SECOND PERSON - - ''' "THEM" - 3RD PERSON - - ''' "NOBODY" NO PERSPECTIVE - ''' - ''' - ''' - Public Shared Function GetGramiticalPersonStr(ByRef UserInputStr As String) As PerspectivePerson - If DETECT_1ST_PERSON(UserInputStr) = True Then Return PerspectivePerson.First_Person_ME - If DETECT_2ND_PERSON(UserInputStr) = True Then Return PerspectivePerson.Second_Person_YOU - If DETECT_3RD_PERSON(UserInputStr) = True Then Return PerspectivePerson.Third_Person_THEM - Return PerspectivePerson.NOBODY - End Function - - ''' - ''' First person definition: first person indicates the speaker. - ''' First person is the I/we perspective. - ''' We, us, our,and ourselves are all first-person pronouns. - ''' Specifically, they are plural first-person pronouns. - ''' Singular first-person pronouns include I, me, my, mine and myself. - ''' - ''' - Public Shared Function DETECT_1ST_PERSON(ByRef UserSentence As String) As Boolean - DETECT_1ST_PERSON = False - If DETECT_PERSPECTIVE(UserSentence, FirstPersonProNouns) = True Then Return True - End Function - - ''' - ''' Second person definition: second person indicates the addressee. - ''' Second person is the you perspective. - ''' The second-person point of view belongs to the person (or people) being addressed. - ''' This is the “you” perspective. - ''' the biggest indicator of the second person is the use of second-person pronouns: - ''' you, your, yours, yourself, yourselves. - ''' - ''' - Public Shared Function DETECT_2ND_PERSON(ByRef UserSentence As String) As Boolean - DETECT_2ND_PERSON = False - If DETECT_PERSPECTIVE(UserSentence, SecondPersonProNouns) = True Then Return True - End Function - - ''' - ''' Third person definition: third person indicates a third party individual other than the speaker. - ''' Third person is the he/she/it/they perspective. - ''' The third-person point of view belongs to the person (or people) being talked about. - ''' The third-person pronouns include - ''' he, him, his, himself, she, her, hers, herself, it, its, itself, they, them, their, theirs, and themselves. - ''' - ''' - Public Shared Function DETECT_3RD_PERSON(ByRef UserSentence As String) As Boolean - DETECT_3RD_PERSON = False - If DETECT_PERSPECTIVE(UserSentence, ThirdPersonProNouns) = True Then Return True - End Function - - ''' - ''' Returns detected Pronoun indicator - ''' - ''' - ''' - Public Shared Function GetDetectedPersonalProNoun(ByRef Userinput As String) As String - Dim lst As New List(Of String) - lst.AddRange(FirstPersonProNouns) - lst.AddRange(SecondPersonProNouns) - lst.AddRange(ThirdPersonProNouns) - For Each item In lst - If Userinput.Contains(UCase(item)) Then Return UCase(item) - Next - Return "" - End Function - -#End Region - -#Region "Case-The cases Of pronouns tell you how they are being used In a sentence." - - ''' - ''' The cases Of pronouns tell you how they are being used In a sentence. - ''' SUBJECT / OBJECT or POSSESSION - or NONE - ''' - ''' - Public Function CheckCase(ByRef Userinput As String) As PerspectiveCase - CheckCase = PerspectiveCase.NOBODY - If CheckPersonalSubject(Userinput) = True Then - Return PerspectiveCase.PersonalSubject - Else - If CheckPersonalObject(Userinput) = True Then - Return PerspectiveCase.PersonalObject - If CheckPersonalPossession(Userinput) = True Then - Return PerspectiveCase.PersonalPosessive - End If - End If - End If - - End Function - - ''' - ''' The cases Of pronouns tell you how they are being used In a sentence. - ''' - Public Shared Function CheckPersonalSubject(ByRef Userinput As String) As Boolean - Dim mFirstPersonProNouns As List(Of String) = New List(Of String)({" I", " WE"}) - Dim mSecondPersonProNouns As List(Of String) = New List(Of String)({" YOU", " US"}) - Dim mThirdPersonProNouns As List(Of String) = New List(Of String)({" HE", " SHE", " IT"}) - If DETECT_PERSPECTIVE(Userinput, mFirstPersonProNouns) = True Or - DETECT_PERSPECTIVE(Userinput, mSecondPersonProNouns) = True Or - DETECT_PERSPECTIVE(Userinput, mThirdPersonProNouns) = True Then - Return True - Else - Return False - End If - - End Function - - ''' - ''' The cases Of pronouns tell you how they are being used In a sentence. - ''' - Public Shared Function CheckPersonalObject(ByRef Userinput As String) As Boolean - Dim mFirstPersonProNouns As List(Of String) = New List(Of String)({"ME"}) - Dim mSecondPersonProNouns As List(Of String) = New List(Of String)({"YOU", "US"}) - Dim mThirdPersonProNouns As List(Of String) = New List(Of String)({" HIM", " HER", " IT"}) - If DETECT_PERSPECTIVE(Userinput, mFirstPersonProNouns) = True Or - DETECT_PERSPECTIVE(Userinput, mSecondPersonProNouns) = True Or - DETECT_PERSPECTIVE(Userinput, mThirdPersonProNouns) = True Then - Return True - Else - Return False - End If - End Function - - ''' - ''' The cases Of pronouns tell you how they are being used In a sentence. - ''' - Public Shared Function CheckPersonalPossession(ByRef Userinput As String) As Boolean - Dim mFirstPersonProNouns As List(Of String) = New List(Of String)({" MY", " MINE", " OUR", " OURS"}) - Dim mSecondPersonProNouns As List(Of String) = New List(Of String)({" YOUR", " YOURS"}) - Dim mThirdPersonProNouns As List(Of String) = New List(Of String)({" HIS", " HER", " HES", " HE IS", " HERS", " THEIR", " THEIRS"}) - If DETECT_PERSPECTIVE(Userinput, mFirstPersonProNouns) = True Or - DETECT_PERSPECTIVE(Userinput, mSecondPersonProNouns) = True Or - DETECT_PERSPECTIVE(Userinput, mThirdPersonProNouns) = True Then - Return True - Else - Return False - End If - End Function - - ''' - ''' The cases Of pronouns tell you how they are being used In a sentence. - ''' "ME" - 1ST PERSON - - ''' "YOU" - SECOND PERSON - - ''' "THEM" - 3RD PERSON - - ''' "NOBODY" NO PERSPECTIVE - ''' - Public Shared Function GetPersonalSubject(ByRef Userinput As String) As String - Dim mFirstPersonProNouns As List(Of String) = New List(Of String)({" I", " WE"}) - Dim mSecondPersonProNouns As List(Of String) = New List(Of String)({" YOU", " US"}) - Dim mThirdPersonProNouns As List(Of String) = New List(Of String)({" HE", " SHE", " IT"}) - If DETECT_PERSPECTIVE(Userinput, mFirstPersonProNouns) = True Then - Return "ME" - Else - If DETECT_PERSPECTIVE(Userinput, mSecondPersonProNouns) = True Then - Return "YOU" - Else - If DETECT_PERSPECTIVE(Userinput, mThirdPersonProNouns) = True Then - Return "THEM" - Else - Return "NOBODY" - End If - End If - End If - - End Function - - ''' - ''' The cases Of pronouns tell you how they are being used In a sentence. - ''' "ME" - 1ST PERSON - - ''' "YOU" - SECOND PERSON - - ''' "THEM" - 3RD PERSON - - ''' "NOBODY" NO PERSPECTIVE - ''' - Public Shared Function GetPersonalObject(ByRef Userinput As String) As String - Dim mFirstPersonProNouns As List(Of String) = New List(Of String)({"ME"}) - Dim mSecondPersonProNouns As List(Of String) = New List(Of String)({"YOU", "US"}) - Dim mThirdPersonProNouns As List(Of String) = New List(Of String)({" HIM", " HER", " IT"}) - If DETECT_PERSPECTIVE(Userinput, mFirstPersonProNouns) = True Then - Return "ME" - Else - If DETECT_PERSPECTIVE(Userinput, mSecondPersonProNouns) = True Then - Return "YOU" - Else - If DETECT_PERSPECTIVE(Userinput, mThirdPersonProNouns) = True Then - Return "THEM" - Else - Return "NOBODY" - End If - End If - End If - End Function - - ''' - ''' The cases Of pronouns tell you how they are being used In a sentence. - ''' "ME" - 1ST PERSON - - ''' "YOU" - SECOND PERSON - - ''' "THEM" - 3RD PERSON - - ''' "NOBODY" NO PERSPECTIVE - ''' - Public Shared Function GetPersonalPossession(ByRef Userinput As String) As String - Dim mFirstPersonProNouns As List(Of String) = New List(Of String)({" MY", " MINE", " OUR", " OURS"}) - Dim mSecondPersonProNouns As List(Of String) = New List(Of String)({" YOUR", " YOURS"}) - Dim mThirdPersonProNouns As List(Of String) = New List(Of String)({" HIS", " HER", " HES", " HE IS", " HERS", " THEIR", " THEIRS"}) - If DETECT_PERSPECTIVE(Userinput, mFirstPersonProNouns) = True Then - Return "ME" - Else - If DETECT_PERSPECTIVE(Userinput, mSecondPersonProNouns) = True Then - Return "YOU" - Else - If DETECT_PERSPECTIVE(Userinput, mThirdPersonProNouns) = True Then - Return "THEM" - Else - Return "NOBODY" - End If - End If - End If - End Function - -#End Region - - End Class - ''' ''' counts occurrences of a specific phoneme ''' @@ -500,7 +225,6 @@ PROC_ERR: End Class - Public Interface ICorpusChunker Function FilterUsingPunctuationVocabulary(data As List(Of String)) As List(Of String) @@ -512,6 +236,7 @@ PROC_ERR: Function ProcessTextData(rawData As String, useFiltering As Boolean) As List(Of String) End Interface + Public Class ChunkProcessor Private chunkType As ChunkType Private maxSize As Integer @@ -570,1332 +295,183 @@ PROC_ERR: Public Shared Sub OutputToJSON(data As List(Of String), outputPath As String) Dim jsonData As New List(Of Object) - For Each chunk As String In data - jsonData.Add(New With {.content = chunk}) - Next - Dim jsonText As String = JsonConvert.SerializeObject(jsonData, Formatting.Indented) - File.WriteAllText(outputPath, jsonText) - End Sub - - Public Shared Sub OutputToListOfLists(data As List(Of String), outputPath As String) - File.WriteAllLines(outputPath, data) - End Sub - - Public Shared Sub OutputToStructured(entityChunks As List(Of KeyValuePair(Of String, String)), outputPath As String) - Dim structuredData As New List(Of Object) - For Each entityChunk As KeyValuePair(Of String, String) In entityChunks - structuredData.Add(New With { - .entityType = entityChunk.Key, - .content = entityChunk.Value - }) - Next - Dim jsonText As String = JsonConvert.SerializeObject(structuredData, Formatting.Indented) - File.WriteAllText(outputPath, jsonText) - End Sub - - Public Shared Function ProcessFile(inputPath As String, outputDirectory As String, entityListfilePath As String, maxSize As Integer, useFiltering As Boolean, chunkType As ChunkType) As List(Of String) - Dim rawData As String = File.ReadAllText(inputPath) - Dim chunks As List(Of String) = Chunk(rawData, chunkType, maxSize) - - ' Load entity list if filtering is selected - If useFiltering Then - Dim filterList = EntityLoader.LoadEntityListFromFile(entityListfilePath) - - ' Detect and output structured entities - Dim entityChunks As List(Of KeyValuePair(Of String, String)) = EntityLoader.DetectEntities(chunks, filterList) - OutputToStructured(entityChunks, Path.Combine(outputDirectory, "entity_output.txt")) - End If - If maxSize > 0 Then - ' Apply padding based on maxSize - chunks = ApplyPadding(chunks, maxSize) - Else - End If - - ' Output to different formats - OutputToListOfLists(chunks, Path.Combine(outputDirectory, "output.txt")) - OutputToCSV(chunks, Path.Combine(outputDirectory, "output.csv")) - OutputToJSON(chunks, Path.Combine(outputDirectory, "output.json")) - - ' Create punctuation vocabulary - Return chunks - End Function - - Public Function ApplyFiltering(chunks As List(Of String), filterList As List(Of KeyValuePair(Of String, String))) As List(Of String) - Dim filteredChunks As New List(Of String) - - For Each chunk As String In chunks - For Each filterItem As KeyValuePair(Of String, String) In filterList - If chunk.Contains(filterItem.Value) Then - filteredChunks.Add(chunk) - Exit For - End If - Next - Next - - Return filteredChunks - End Function - - Public Function ApplyPadding(chunks As List(Of String)) As List(Of String) - ' Padding logic for text data chunks - Dim paddedChunks As New List(Of String) - - For Each chunk As String In chunks - If chunk.Length > maxSize Then - ' Apply padding if chunk size exceeds maxSize - paddedChunks.Add(chunk.Substring(0, maxSize)) - Else - paddedChunks.Add(chunk) - End If - Next - - Return paddedChunks - End Function - - Public Function Chunk(data As String, chunkType As ChunkType) As List(Of String) - ' Chunking logic for text data based on chunkType - Dim chunks As New List(Of String) - - Select Case chunkType - Case ChunkType.Sentence - ' Split into sentences - chunks.AddRange(data.Split("."c)) - Case ChunkType.Paragraph - ' Split into paragraphs - chunks.AddRange(data.Split(Environment.NewLine)) - Case ChunkType.Document - ' Treat the whole data as a document - chunks.Add(data) - End Select - If maxSize > 0 Then - ' Apply padding based on maxSize - chunks = ApplyPadding(chunks) - End If - - Return chunks - End Function - - Public Function CustomizeChunkingAndPadding(data As String) As List(Of String) - Dim chunks As List(Of String) = Chunk(data, chunkType) - - If maxSize > 0 Then - chunks = ApplyPadding(chunks) - End If - - Return chunks - End Function - - ''' - ''' Filters out chunks containing specific punctuation marks or symbols. - ''' - ''' The list of processed text data chunks. - ''' A list of filtered text data chunks. - Public Function FilterUsingPunctuationVocabulary(data As List(Of String), ByRef punctuationVocabulary As HashSet(Of String)) As List(Of String) - Dim filteredData As New List(Of String) - - For Each chunk As String In data - Dim symbols As String() = chunk.Split().Where(Function(token) Not Char.IsLetterOrDigit(token(0))).ToArray() - - Dim containsPunctuation As Boolean = False - For Each symbol As String In symbols - If punctuationVocabulary.Contains(symbol) Then - containsPunctuation = True - Exit For - End If - Next - - If Not containsPunctuation Then - filteredData.Add(chunk) - End If - Next - - Return filteredData - End Function - - Public Sub ProcessAndFilterChunks(inputPath As String, outputPath As String, filterListPath As String, chunkType As ChunkType, maxSize As Integer) - Dim rawData As String = File.ReadAllText(inputPath) - Dim chunks As List(Of String) = Chunk(rawData, chunkType, maxSize) - - If Not String.IsNullOrEmpty(filterListPath) Then - Dim filterList As List(Of KeyValuePair(Of String, String)) = EntityLoader.LoadEntityListFromFile(filterListPath) - chunks = ApplyFiltering(chunks, filterList) - End If - - ' Apply padding if maxSize is specified - If maxSize > 0 Then - chunks = ApplyPadding(chunks, maxSize) - End If - - ' Output to different formats - OutputToListOfLists(chunks, Path.Combine(outputPath, "output.txt")) - OutputToCSV(chunks, Path.Combine(outputPath, "output.csv")) - OutputToJSON(chunks, Path.Combine(outputPath, "output.json")) - End Sub - - Public Function ProcessFile(inputPath As String, outputDirectory As String) - Dim rawData As String = File.ReadAllText(inputPath) - Dim chunks As List(Of String) = Chunk(rawData, chunkType) - - ' Output to different formats - OutputToListOfLists(chunks, Path.Combine(outputDirectory, "output.txt")) - OutputToCSV(chunks, Path.Combine(outputDirectory, "output.csv")) - OutputToJSON(chunks, Path.Combine(outputDirectory, "output.json")) - Return chunks - End Function - - End Class - End Namespace - Namespace Tokenizers - Public Class TokenizerWordPiece - Private ReadOnly corpus As List(Of String) - Private vocabulary As Dictionary(Of String, Integer) - Private maxVocabSize As Integer - Private ReadOnly maxSubwordLength As Integer - - - - Public Sub New() - End Sub - Public Sub New(corpus As List(Of String)) - Me.corpus = corpus - Me.vocabulary = New Dictionary(Of String, Integer) - Me.maxVocabSize = 1000000 - Me.maxSubwordLength = 20 - End Sub - Public Sub New(corpus As List(Of String), vocabulary As Dictionary(Of String, Integer), maxVocabSize As Integer, maxSubwordLength As Integer) - If corpus Is Nothing Then - Throw New ArgumentNullException(NameOf(corpus)) - End If - - If vocabulary Is Nothing Then - Throw New ArgumentNullException(NameOf(vocabulary)) - End If - - Me.corpus = corpus - Me.vocabulary = vocabulary - Me.maxVocabSize = maxVocabSize - Me.maxSubwordLength = maxSubwordLength - End Sub - Public Sub Train() - Dim subwordCounts As New Dictionary(Of String, Integer) - - ' Count subword occurrences in the corpus - For Each sentence As String In corpus - Dim tokens As List(Of String) = Tokenize(sentence) - - For Each token As String In tokens - If subwordCounts.ContainsKey(token) Then - subwordCounts(token) += 1 - Else - subwordCounts.Add(token, 1) - End If - Next - Next - - ' Sort subwords by frequency and add them to the vocabulary - Dim sortedSubwords = subwordCounts.OrderByDescending(Function(pair) pair.Value) - - For Each pair In sortedSubwords.Take(maxVocabSize) - vocabulary.Add(pair.Key, vocabulary.Count) - Next - End Sub - - - Public Function GetVocabulary() As Dictionary(Of String, Integer) - Return vocabulary - End Function - Public Function Tokenize(text As String) As List(Of String) - Dim tokens As New List(Of String) - Dim index As Integer = 0 - - While index < text.Length - Dim subwordLength As Integer = Math.Min(maxSubwordLength, text.Length - index) - Dim subword As String = text.Substring(index, subwordLength) - - While subwordLength > 0 AndAlso Not vocabulary.ContainsKey(subword) - subwordLength -= 1 - subword = text.Substring(index, subwordLength) - End While - - tokens.Add(subword) - index += subwordLength - End While - - Return tokens - End Function - Public Shared Function CalculateWordPieceFrequency(ByVal subword As String, ByVal mergedWord As String) As Integer - Dim occurrences As Integer = 0 - Dim index As Integer = -1 - - While True - index = mergedWord.IndexOf(subword, index + 1) - If index = -1 Then - Exit While - End If - - ' Check if the found index is part of a valid word (not a subword of another word) - If index = 0 OrElse mergedWord(index - 1) = " "c Then - Dim endIndex As Integer = index + subword.Length - If endIndex = mergedWord.Length OrElse mergedWord(endIndex) = " "c Then - occurrences += 1 - End If - End If - End While - - Return occurrences - End Function - - - End Class - Public Class TokenizerBPE - Public Class BpeSubwordPair - Public Property Subword1 As String - Public Property Subword2 As String - Public Property Frequency As Integer - - Public Sub New(subword1 As String, subword2 As String, frequency As Integer) - Me.Subword1 = subword1 - Me.Subword2 = subword2 - Me.Frequency = frequency - End Sub - End Class - Public Class BpeVocabulary - Inherits Dictionary(Of String, Integer) - End Class - Private Sub New() - ' Private constructor to prevent instantiation without parameters - End Sub - - - - Public Shared Function TrainBpeModel(corpus As List(Of String), numMerges As Integer) As BpeVocabulary - ' Tokenize the corpus at the character level to get the initial vocabulary - Dim characterLevelVocabulary As BpeVocabulary = TokenizeCorpusToCharacterLevel(corpus) - - ' Merge the most frequent pairs of subwords iteratively - For i As Integer = 0 To numMerges - 1 - Dim mostFrequentPair As BpeSubwordPair = FindMostFrequentPair(characterLevelVocabulary) - If mostFrequentPair Is Nothing Then - Exit For - End If - - Dim newSubword As String = mostFrequentPair.Subword1 + mostFrequentPair.Subword2 - characterLevelVocabulary = MergeSubwordPair(characterLevelVocabulary, mostFrequentPair, newSubword) - Next - - Return characterLevelVocabulary - End Function - - Private Shared Function TokenizeCorpusToCharacterLevel(corpus As List(Of String)) As BpeVocabulary - Dim characterLevelVocabulary As New BpeVocabulary() - - For Each document As String In corpus - For Each character As Char In document - Dim subword As String = character.ToString() - - If characterLevelVocabulary.ContainsKey(subword) Then - characterLevelVocabulary(subword) += 1 - Else - characterLevelVocabulary.Add(subword, 1) - End If - Next - Next - - Return characterLevelVocabulary - End Function - - Private Shared Function FindMostFrequentPair(vocabulary As BpeVocabulary) As BpeSubwordPair - Dim mostFrequentPair As BpeSubwordPair = Nothing - Dim maxFrequency As Integer = 0 - - For Each subword1 As String In vocabulary.Keys - For Each subword2 As String In vocabulary.Keys - If subword1 <> subword2 Then - Dim pairFrequency As Integer = CalculatePairFrequency(vocabulary, subword1, subword2) - If pairFrequency > maxFrequency Then - maxFrequency = pairFrequency - mostFrequentPair = New BpeSubwordPair(subword1, subword2, pairFrequency) - End If - End If - Next - Next - - Return mostFrequentPair - End Function - - Private Shared Function CalculatePairFrequency(vocabulary As BpeVocabulary, subword1 As String, subword2 As String) As Integer - Dim pairFrequency As Integer = 0 - - For Each word As String In vocabulary.Keys - Dim mergedWord As String = word.Replace(subword1 + subword2, subword1 + subword2.ToLower()) - Dim occurrences As Integer = 0 - Dim index As Integer = -1 - - While True - index = mergedWord.IndexOf(subword1 + subword2.ToLower(), index + 1) - If index = -1 Then - Exit While - End If - occurrences += 1 - End While - - - pairFrequency += occurrences * vocabulary(word) - Next - - Return pairFrequency - End Function - - Private Shared Function MergeSubwordPair(vocabulary As BpeVocabulary, pairToMerge As BpeSubwordPair, newSubword As String) As BpeVocabulary - Dim newVocabulary As New BpeVocabulary() - - For Each subword As String In vocabulary.Keys - Dim mergedSubword As String = subword.Replace(pairToMerge.Subword1 + pairToMerge.Subword2, newSubword) - newVocabulary(mergedSubword) = vocabulary(subword) - Next - - Return newVocabulary - End Function - End Class - Public Class TokenizerBitWord - Public Property Vocabulary As Dictionary(Of String, Integer) - Public Sub New() - Vocabulary = New Dictionary(Of String, Integer) - End Sub - Public Function Tokenize(ByRef Corpus As List(Of String)) As List(Of String) - Dim tokens As New List(Of String) - Dim Subword As String = "" - - Dim UnknownDocs As New List(Of String) - 'SubDoc Vocabulary Tokenizer - For Each doc In Corpus - For i = 0 To doc.Count - 1 - Subword &= doc(i) - If Vocabulary.ContainsKey(Subword.ToLower()) Then - tokens.Add(Subword) - Subword = "" - End If - - Next - 'Save unknowns - If Subword <> "" Then - UnknownDocs.Add(Subword) - End If - Next - 'Unknown paragraphs - Dim UnknownParagraphs As New List(Of String) - If UnknownDocs.Count > 0 Then - For Each doc In UnknownDocs - Dim Para As List(Of String) = BasicTokenizer.TokenizeToParagraph(doc) - For Each item In Para - Subword = "" - - Subword += item - If Vocabulary.ContainsKey(Subword.ToLower) Then - ' If the subword is in the Vocabulary, add it to the list of subwords - tokens.Add(Subword.ToLower) - ' Reset the subword for the next iteration - Subword = "" - End If - 'Save unknowns - If Subword <> "" Then - UnknownParagraphs.Add(Subword) - End If - Next - - Next - End If - 'Unknown Sentences - Dim UnknownSents As New List(Of String) - If UnknownParagraphs.Count > 0 Then - For Each sent In UnknownParagraphs - Dim Sents As List(Of String) = BasicTokenizer.TokenizeToSentence(sent) - - - For Each item In Sents - Subword = "" - - Subword += item - If Vocabulary.ContainsKey(Subword.ToLower) Then - ' If the subword is in the Vocabulary, add it to the list of subwords - tokens.Add(Subword.ToLower) - ' Reset the subword for the next iteration - Subword = "" - End If - 'Save unknowns - If Subword <> "" Then - UnknownSents.Add(Subword) - End If - Next - Next - End If - 'Unknown Words - Dim UnknownWords As New List(Of String) - If UnknownSents.Count > 0 Then - For Each Word In UnknownSents - Dim Words As List(Of String) = BasicTokenizer.TokenizeToWord(Word) - For Each item In Words - Subword = "" - - Subword += item - If Vocabulary.ContainsKey(Subword.ToLower) Then - ' If the subword is in the Vocabulary, add it to the list of subwords - tokens.Add(Subword.ToLower) - ' Reset the subword for the next iteration - Subword = "" - End If - 'Save unknowns - If Subword <> "" Then - UnknownWords.Add(Subword) - End If - Next - - Next - - End If - 'Unknown Words - Dim UnknownChars As New List(Of String) - If UnknownWords.Count > 0 Then - For Each iChar In UnknownWords - Dim Chars As List(Of String) = BasicTokenizer.TokenizeToCharacter(iChar) - For Each item In Chars - Subword = "" - - Subword += item - If Vocabulary.ContainsKey(Subword.ToLower) Then - ' If the subword is in the Vocabulary, add it to the list of subwords - tokens.Add(Subword.ToLower) - ' Reset the subword for the next iteration - Subword = "" - End If - 'Save unknowns - If Subword <> "" Then - UnknownChars.Add(Subword) - End If - Next - - Next - - End If - - For Each unkChar In UnknownChars - Vocabulary.Add(unkChar, 1) - Next - - Console.WriteLine("Recognized Tokens") - For Each tok In tokens - Console.WriteLine("Token =" & tok) - Next - - Console.WriteLine("UnRecognized Tokens") - For Each tok In UnknownChars - Console.WriteLine("Token =" & tok) - Next - Return tokens - End Function - - Public Sub Train(corpus As List(Of String), MaxMergeOperations As Integer) - ' Initialize the vocabulary with word-level subword units - Tokenize(corpus) - Dim mergeOperationsCount As Integer = 0 - - While mergeOperationsCount < MaxMergeOperations - ' Compute the frequency of subword units in the vocabulary - Dim subwordFrequencies As New Dictionary(Of String, Integer) - - For Each subword In Vocabulary.Keys - Dim subwordUnits = BasicTokenizer.TokenizeToCharacter(subword) - For Each unit In subwordUnits - If subwordFrequencies.ContainsKey(unit) Then - subwordFrequencies(unit) += Vocabulary(subword) - Else - subwordFrequencies.Add(unit, Vocabulary(subword)) - End If - Next - Next - - ' Find the most frequent pair of subword units - Dim mostFrequentPair As KeyValuePair(Of String, Integer) = subwordFrequencies.OrderByDescending(Function(pair) pair.Value).FirstOrDefault() - - If mostFrequentPair.Value < 2 Then - ' Stop merging if the frequency of the most frequent pair is less than 2 - Exit While - End If - - ' Merge the most frequent pair into a new subword unit - Dim newSubwordUnit = mostFrequentPair.Key - - ' Update the vocabulary by replacing occurrences of the merged subword pair with the new subword unit - Dim updatedVocabulary As New Dictionary(Of String, Integer) - - For Each subword In Vocabulary.Keys - Dim mergedSubword = subword.Replace(mostFrequentPair.Key, newSubwordUnit) - updatedVocabulary(mergedSubword) = Vocabulary(subword) - Next - - Vocabulary = updatedVocabulary - mergeOperationsCount += 1 - - End While - - End Sub - - Public Class BasicTokenizer - - Public Shared Function TokenizeToCharacter(Document As String) As List(Of String) - Document = Document.ToLower() - Dim characters As Char() = Document.ToCharArray() - TokenizeToCharacter = New List(Of String) - For Each item In characters - TokenizeToCharacter.Add(item) - Next - End Function - - Public Shared Function TokenizeToWord(Document As String) As List(Of String) - Document = Document.ToLower() - Document = Document.SpacePunctuation - Return Document.Split({" ", ".", ",", ";", ":", "!", "?"}, StringSplitOptions.RemoveEmptyEntries).ToList - End Function - - Public Shared Function TokenizeToSentence(Document As String) As List(Of String) - Document = Document.ToLower() - Document = Document.SpacePunctuation - Return Split(Document, ".").ToList - End Function - - Public Shared Function TokenizeToParagraph(Document As String) As List(Of String) - Document = Document.ToLower() - - Return Split(Document, vbNewLine).ToList - End Function - - End Class - - End Class - Public Class TokenizerPositional - Private iStopWords As List(Of String) - - Private Function RemoveStopWords(ByVal tokens As List(Of Token)) As List(Of Token) - Return tokens.Where(Function(token) Not StopWords.Contains(token.Value)).ToList() - End Function - Public Property StopWordRemovalEnabled As Boolean - - Public Property StopWords As List(Of String) - Get - Return iStopWords - End Get - Set(value As List(Of String)) - iStopWords = value - End Set - End Property - Public Structure Token - ''' - ''' Initializes a new instance of the Token structure. - ''' - ''' The type of the token. - ''' The string value of the token. - Public Sub New(ByVal type As String, ByVal value As String) - Me.Type = type - Me.Value = value - End Sub - - Public Sub New(ByVal type As TokenType, ByVal value As String, ByVal startPosition As Integer, ByVal endPosition As Integer) - Me.Type = type - Me.Value = value - Me.StartPosition = startPosition - Me.EndPosition = endPosition - End Sub - - Public Property EndPosition As Integer - Public Property StartPosition As Integer - Public Property Type As TokenType - Public Property Value As String - End Structure - - ''' - ''' Returns Tokens With Positions - ''' - ''' - ''' - Public Shared Function TokenizeByCharacter(ByVal input As String) As List(Of Token) - Dim characters As Char() = input.ToCharArray() - Dim tokens As New List(Of Token) - Dim currentPosition As Integer = 0 - - For Each character As Char In characters - Dim startPosition As Integer = currentPosition - Dim endPosition As Integer = currentPosition - Dim token As New Token(TokenType.Character, character.ToString(), startPosition, endPosition) - tokens.Add(token) - currentPosition += 1 - Next - - Return tokens - End Function - - ''' - ''' Returns Tokens With Positions - ''' - ''' - ''' - Public Shared Function TokenizeBySentence(ByVal input As String) As List(Of Token) - Dim sentences As String() = input.Split("."c) - Dim tokens As New List(Of Token) - Dim currentPosition As Integer = 0 - - For Each sentence As String In sentences - Dim startPosition As Integer = currentPosition - Dim endPosition As Integer = currentPosition + sentence.Length - 1 - Dim token As New Token(TokenType.Sentence, sentence, startPosition, endPosition) - tokens.Add(token) - currentPosition = endPosition + 2 ' Account for the period and the space after the sentence - Next - - Return tokens - End Function - - ''' - ''' Returns Tokens With Positions - ''' - ''' - ''' - Public Shared Function TokenizeByWord(ByVal input As String) As List(Of Token) - Dim words As String() = input.Split(" "c) - Dim tokens As New List(Of Token) - Dim currentPosition As Integer = 0 - - For Each word As String In words - Dim startPosition As Integer = currentPosition - Dim endPosition As Integer = currentPosition + word.Length - 1 - Dim token As New Token(TokenType.Word, word, startPosition, endPosition) - tokens.Add(token) - currentPosition = endPosition + 2 ' Account for the space between words - Next - - Return tokens - End Function - - ''' - ''' Pure basic Tokenizer to Tokens - ''' - ''' - ''' Type Of Tokenization - ''' - Public Shared Function TokenizeInput(ByRef Corpus As List(Of String), tokenizationOption As TokenizerType) As List(Of Token) - Dim ivocabulary As New List(Of Token) - - For Each Doc In Corpus - Select Case tokenizationOption - Case TokenizerType._Char - ivocabulary.AddRange(TokenizeByCharacter(Doc.ToLower)) - Case TokenizerType._Word - ivocabulary.AddRange(TokenizeByWord(Doc.ToLower)) - Case TokenizerType._Sentence - ivocabulary.AddRange(TokenizeBySentence(Doc.ToLower)) - - - End Select - Next - - Return ivocabulary - End Function - - End Class - Public Class TokenizerTokenID - Public TokenToId As New Dictionary(Of String, Integer) - Private idToToken As New Dictionary(Of Integer, String) - Private nextId As Integer = 0 - - Private vocab As New Dictionary(Of String, Integer) - Public Sub New(ByRef Vocabulary As Dictionary(Of String, Integer)) - vocab = Vocabulary - TokenToId = New Dictionary(Of String, Integer) - idToToken = New Dictionary(Of Integer, String) - End Sub - - ''' - ''' Pure Tokenizer (will tokenize based on the Tokenizer model settings) - ''' - ''' - ''' - Public Function TokenizeToTokenIDs(text As String) As List(Of Integer) - Dim tokens = TokenizerPositional.TokenizeByWord(text) - Dim tokenIds As New List(Of Integer) - - For Each itoken In tokens - Dim tokenId As Integer - If TokenToId.ContainsKey(itoken.Value) Then - tokenId = TokenToId(itoken.Value) - Else - 'Not registered - - tokenId = TokenToId(itoken.Value) - - End If - tokenIds.Add(tokenId) - - Next - - Return tokenIds - End Function - - Private Sub AddTokenID(text As String) - - If Not vocab.ContainsKey(text) Then - vocab(text) = nextId - nextId += 1 - TokenToId = vocab.ToDictionary(Function(x) x.Key, Function(x) x.Value) - idToToken = TokenToId.ToDictionary(Function(x) x.Value, Function(x) x.Key) - End If - - - End Sub - - ''' - ''' Given a Set of Token ID Decode the Tokens - ''' - ''' - ''' - Public Function Detokenize(tokenIds As List(Of Integer)) As String - Dim tokens As New List(Of String) - - For Each tokenId As Integer In tokenIds - tokens.Add(idToToken(tokenId)) - Next - - Return String.Join(" ", tokens) - End Function - End Class - - - Public Class Tokenizer - Public Property Vocabulary As Dictionary(Of String, Integer) - Public ReadOnly Property PairFrequencies As Dictionary(Of String, Integer) = ComputePairFrequencies() - Public ReadOnly Property maxSubwordLen As Integer = Me.Vocabulary.Max(Function(token) token.Key.Length) - Private ReadOnly unkToken As String = "" - ''' - ''' Defines max entries in vocabulary before Pruning Rare Words - ''' - ''' - Public Property VocabularyPruneValue As Integer = 100000 - - Public Sub New() - Vocabulary = New Dictionary(Of String, Integer) - - End Sub - Public Function GetVocabulary() As List(Of String) - Return Vocabulary.Keys.ToList() - End Function - - Public Sub New(vocabulary As Dictionary(Of String, Integer), Optional vocabularyPruneValue As Integer = 1000000) - Me.Vocabulary = vocabulary - Me.VocabularyPruneValue = vocabularyPruneValue - End Sub - - Private Function TokenizeWordPiece(text As String) As List(Of String) - Dim tokens As New List(Of String) - Dim pos As Integer = 0 - - While pos < text.Length - Dim foundSubword As Boolean = False - Dim subword As String = "" - - For subwordLen As Integer = Math.Min(Me.maxSubwordLen, text.Length - pos) To 1 Step -1 - subword = text.Substring(pos, subwordLen) - - If Vocabulary.Keys.Contains(subword) Then - tokens.Add(subword) - pos += subwordLen - foundSubword = True - Exit For - End If - Next - - ' If no subword from the vocabulary matches, split into WordPiece tokens - If Not foundSubword Then - Dim wordPieceTokens As List(Of String) = TokenizeBitWord(subword) - tokens.AddRange(wordPieceTokens) - UpdateVocabulary(subword) - pos += subword.Length - End If - End While - - Return tokens - End Function - Private Function TokenizeBitWord(subword As String) As List(Of String) - Dim wordPieceTokens As New List(Of String) - Dim startIdx As Integer = 0 - - While startIdx < subword.Length - Dim endIdx As Integer = subword.Length - Dim foundSubword As Boolean = False - - While startIdx < endIdx - Dim candidate As String = subword.Substring(startIdx, endIdx - startIdx) - Dim isLast = endIdx = subword.Length - - If Vocabulary.Keys.Contains(candidate) OrElse isLast Then - wordPieceTokens.Add(candidate) - startIdx = endIdx - foundSubword = True - Exit While - End If - - endIdx -= 1 - End While - - ' If no subword from the vocabulary matches, break the subword into smaller parts - If Not foundSubword Then - wordPieceTokens.Add("") - startIdx += 1 - End If - End While - - Return wordPieceTokens - End Function - Private Shared Function TokenizeBitWord(subword As String, ByRef Vocab As Dictionary(Of String, Integer)) As List(Of String) - - Dim wordPieceTokens As New List(Of String) - Dim startIdx As Integer = 0 - - While startIdx < subword.Length - Dim endIdx As Integer = subword.Length - Dim foundSubword As Boolean = False - - While startIdx < endIdx - Dim candidate As String = subword.Substring(startIdx, endIdx - startIdx) - Dim isLast = endIdx = subword.Length - - If Vocab.Keys.Contains(candidate) OrElse isLast Then - wordPieceTokens.Add(candidate) - startIdx = endIdx - foundSubword = True - Exit While - End If - - endIdx -= 1 - End While - - ' If no subword from the vocabulary matches, break the subword into smaller parts - If Not foundSubword Then - wordPieceTokens.Add("") - startIdx += 1 - End If - End While - - Return wordPieceTokens - End Function - - Private Function TokenizeBPE(ByVal text As String) As List(Of String) - Dim tokens As New List(Of String) - - While text.Length > 0 - Dim foundToken As Boolean = False - - ' Find the longest token in the vocabulary that matches the start of the text - For Each subword In Vocabulary.OrderByDescending(Function(x) x.Key.Length) - If text.StartsWith(subword.Key) Then - tokens.Add(subword.Key) - text = text.Substring(subword.Key.Length) - foundToken = True - Exit For - End If - Next - - ' If no token from the vocabulary matches, break the text into subwords - If Not foundToken Then - Dim subwordFound As Boolean = False - Dim subword As String = "" - ' Divide the text into subwords starting from the longest possible length - For length = Math.Min(text.Length, 20) To 1 Step -1 - subword = text.Substring(0, length) - - ' Check if the subword is in the vocabulary - If Vocabulary.Keys(subword) Then - tokens.Add(subword) - text = text.Substring(length) - subwordFound = True - Exit For - End If - Next - - ' If no subword from the vocabulary matches, - 'Learn On the fly, But - If Not subwordFound Then - ' Throw New Exception("Unrecognized subword in the text.") - tokens.AddRange(TokenizeBitWord(unkToken & subword)) - UpdateVocabulary(subword) - - End If - End If - End While - - Return tokens - End Function - Private Class NgramTokenizer - - Public Shared Function TokenizetoCharacter(Document As String, n As Integer) As List(Of String) - TokenizetoCharacter = New List(Of String) - Document = Document.ToLower() - Document = Document.SpacePunctuation - - ' Generate character n-grams - For i As Integer = 0 To Document.Length - n - Dim ngram As String = Document.Substring(i, n) - TokenizetoCharacter.Add(ngram) - Next - - End Function - - Public Shared Function TokenizetoWord(ByRef text As String, n As Integer) As List(Of String) - TokenizetoWord = New List(Of String) - text = text.ToLower() - text = text.SpacePunctuation - - ' Split the clean text into individual words - Dim words() As String = text.Split({" ", ".", ",", ";", ":", "!", "?"}, StringSplitOptions.RemoveEmptyEntries) - - ' Generate n-grams from the words - For i As Integer = 0 To words.Length - n - Dim ngram As String = String.Join(" ", words.Skip(i).Take(n)) - TokenizetoWord.Add(ngram) - Next - - End Function - - Public Shared Function TokenizetoParagraph(text As String, n As Integer) As List(Of String) - TokenizetoParagraph = New List(Of String) - - ' Split the text into paragraphs - Dim paragraphs() As String = text.Split({Environment.NewLine & Environment.NewLine}, StringSplitOptions.RemoveEmptyEntries) - - ' Generate paragraph n-grams - For i As Integer = 0 To paragraphs.Length - n - Dim ngram As String = String.Join(Environment.NewLine & Environment.NewLine, paragraphs.Skip(i).Take(n)) - TokenizetoParagraph.Add(ngram) - Next - - Return TokenizetoParagraph - End Function - - Public Shared Function TokenizetoSentence(text As String, n As Integer) As List(Of String) - Dim tokens As New List(Of String) - - ' Split the text into Clauses - Dim Clauses() As String = text.Split({".", ",", ";", ":", "!", "?"}, StringSplitOptions.RemoveEmptyEntries) - - ' Generate sentence n-grams - For i As Integer = 0 To Clauses.Length - n - Dim ngram As String = String.Join(" ", Clauses.Skip(i).Take(n)) - tokens.Add(ngram) - Next - - Return tokens - End Function - - End Class - Private Class BasicTokenizer - - Public Shared Function TokenizeToCharacter(Document As String) As List(Of String) - TokenizeToCharacter = New List(Of String) - Document = Document.ToLower() - For i = 0 To Document.Length - 1 - TokenizeToCharacter.Add(Document(i)) - Next - End Function - - Public Shared Function TokenizeToWord(Document As String) As List(Of String) - Document = Document.ToLower() - Document = Document.SpacePunctuation - Return Document.Split({" ", ".", ",", ";", ":", "!", "?"}, StringSplitOptions.RemoveEmptyEntries).ToList - End Function - - Public Shared Function TokenizeToSentence(Document As String) As List(Of String) - Document = Document.ToLower() - Document = Document.SpacePunctuation - Return Split(Document, ".").ToList - Return Document.Split({".", ",", ";", ":", "!", "?"}, StringSplitOptions.RemoveEmptyEntries).ToList - End Function - - Public Shared Function TokenizeToParagraph(Document As String) As List(Of String) - Document = Document.ToLower() - Return Split(Document, vbNewLine).ToList - End Function - - End Class - Public Sub Add_Vocabulary(initialVocabulary As List(Of String)) - - For Each word In initialVocabulary - - UpdateVocabulary(word) - + For Each chunk As String In data + jsonData.Add(New With {.content = chunk}) Next + Dim jsonText As String = JsonConvert.SerializeObject(jsonData, Formatting.Indented) + File.WriteAllText(outputPath, jsonText) + End Sub + Public Shared Sub OutputToListOfLists(data As List(Of String), outputPath As String) + File.WriteAllLines(outputPath, data) End Sub - Public Sub Initialize_Vocabulary(initialVocabulary As List(Of String), n As Integer) - For Each word In initialVocabulary - For i As Integer = 0 To word.Length - n - UpdateVocabulary(word.Substring(i, n)) - Next + Public Shared Sub OutputToStructured(entityChunks As List(Of KeyValuePair(Of String, String)), outputPath As String) + Dim structuredData As New List(Of Object) + For Each entityChunk As KeyValuePair(Of String, String) In entityChunks + structuredData.Add(New With { + .entityType = entityChunk.Key, + .content = entityChunk.Value + }) Next - + Dim jsonText As String = JsonConvert.SerializeObject(structuredData, Formatting.Indented) + File.WriteAllText(outputPath, jsonText) End Sub - Private Function ComputePairFrequencies() As Dictionary(Of String, Integer) - Dim pairFrequencies As Dictionary(Of String, Integer) = New Dictionary(Of String, Integer) - For Each token As String In Vocabulary.Keys - Dim tokenChars As List(Of Char) = token.ToList() + Public Shared Function ProcessFile(inputPath As String, outputDirectory As String, entityListfilePath As String, maxSize As Integer, useFiltering As Boolean, chunkType As ChunkType) As List(Of String) + Dim rawData As String = File.ReadAllText(inputPath) + Dim chunks As List(Of String) = Chunk(rawData, chunkType, maxSize) - For i As Integer = 0 To tokenChars.Count - 2 - Dim pair As String = tokenChars(i) & tokenChars(i + 1) + ' Load entity list if filtering is selected + If useFiltering Then + Dim filterList = EntityLoader.LoadEntityListFromFile(entityListfilePath) - If Not pairFrequencies.ContainsKey(pair) Then - pairFrequencies.Add(pair, Vocabulary(token)) - Else - Dim value = pairFrequencies(pair) - value += Vocabulary(token) - pairFrequencies.Remove(pair) - pairFrequencies.Add(pair, value) + ' Detect and output structured entities + Dim entityChunks As List(Of KeyValuePair(Of String, String)) = EntityLoader.DetectEntities(chunks, filterList) + OutputToStructured(entityChunks, Path.Combine(outputDirectory, "entity_output.txt")) + End If + If maxSize > 0 Then + ' Apply padding based on maxSize + chunks = ApplyPadding(chunks, maxSize) + Else + End If + + ' Output to different formats + OutputToListOfLists(chunks, Path.Combine(outputDirectory, "output.txt")) + OutputToCSV(chunks, Path.Combine(outputDirectory, "output.csv")) + OutputToJSON(chunks, Path.Combine(outputDirectory, "output.json")) + + ' Create punctuation vocabulary + Return chunks + End Function + Public Function ApplyFiltering(chunks As List(Of String), filterList As List(Of KeyValuePair(Of String, String))) As List(Of String) + Dim filteredChunks As New List(Of String) + For Each chunk As String In chunks + For Each filterItem As KeyValuePair(Of String, String) In filterList + If chunk.Contains(filterItem.Value) Then + filteredChunks.Add(chunk) + Exit For End If Next Next - Return pairFrequencies + Return filteredChunks End Function - Private Sub UpdateFrequencyDictionary(mergedSubword As String) - PairFrequencies.Remove("") - For i As Integer = 0 To mergedSubword.Length - 2 - Dim bigram As String = mergedSubword.Substring(i, 2) - If PairFrequencies.ContainsKey(bigram) Then - PairFrequencies(bigram) += 1 + Public Function ApplyPadding(chunks As List(Of String)) As List(Of String) + ' Padding logic for text data chunks + Dim paddedChunks As New List(Of String) + + For Each chunk As String In chunks + If chunk.Length > maxSize Then + ' Apply padding if chunk size exceeds maxSize + paddedChunks.Add(chunk.Substring(0, maxSize)) Else - PairFrequencies.Add(bigram, 1) + paddedChunks.Add(chunk) End If Next - End Sub - Public Sub UpdateVocabulary(ByRef Term As String) - If Vocabulary.Keys.Contains(Term) = True Then - Dim value = Vocabulary(Term) - value += 1 - Vocabulary.Remove(Term) - Vocabulary.Add(Term, value) - Else - Vocabulary.Add(Term, 1) - End If - End Sub - Public Shared Function UpdateCorpusWithMergedToken(ByRef corpus As List(Of String), pair As String) As List(Of String) - ' Update the text corpus with the merged token for the next iteration. - Return corpus.ConvertAll(Function(text) text.Replace(pair, pair.Replace(" ", "_"))) + Return paddedChunks End Function - Public Sub Prune(pruningThreshold As Integer) - - Dim minimumVocabularySize As Integer = VocabularyPruneValue - If Vocabulary.Count > minimumVocabularySize Then - PruneVocabulary(pruningThreshold) - End If - - End Sub - Private Sub PruneVocabulary(threshold As Integer) - ' Create a list to store tokens to be removed. - Dim tokensToRemove As New List(Of String) - - ' Iterate through the vocabulary and identify tokens to prune. - For Each token In Vocabulary - Dim tokenId As Integer = token.Value - Dim tokenFrequency As Integer = Vocabulary(token.Key) - - ' Prune the token if it has frequency below the threshold (1) and is not recent (has a lower ID). - If tokenFrequency <= threshold AndAlso tokenId < Vocabulary.Count - 1 Then - tokensToRemove.Add(token.Key) - End If - Next - - ' Remove the identified tokens from the vocabulary. - For Each tokenToRemove In tokensToRemove - Vocabulary.Remove(tokenToRemove) - Next - - Console.WriteLine("Pruning completed. Vocabulary size after pruning: " & Vocabulary.Count) - Console.ReadLine() - End Sub - Public Sub Train(text As String, Epochs As Integer) - ' Tokenize the text into individual characters - Dim Bits As List(Of String) = TokenizeBitWord(text) - For Each bit As String In Bits - UpdateVocabulary(bit) - Next - - - ' Train BPE using merging strategy - Dim numMerges As Integer = Epochs ' Define the number of merges, you can adjust it as needed - For mergeIndex As Integer = 0 To numMerges - 1 - MergeMostFrequentBigram() - MergeMostFrequentPair(FindMostFrequentPair.Key) - Next + Public Function Chunk(data As String, chunkType As ChunkType) As List(Of String) + ' Chunking logic for text data based on chunkType + Dim chunks As New List(Of String) - Prune(1) - End Sub - Public Function Tokenize(singleDocument As String, isWordPiece As Boolean) As List(Of String) - ' Tokenize the document using the current vocabulary. - Dim tokens As List(Of String) = If(isWordPiece, Tokenize(singleDocument, True), Tokenize(singleDocument, False)) - If tokens.Contains(unkToken) = True Then - tokens = TrainAndTokenize(singleDocument, isWordPiece, 1) + Select Case chunkType + Case ChunkType.Sentence + ' Split into sentences + chunks.AddRange(data.Split("."c)) + Case ChunkType.Paragraph + ' Split into paragraphs + chunks.AddRange(data.Split(Environment.NewLine)) + Case ChunkType.Document + ' Treat the whole data as a document + chunks.Add(data) + End Select + If maxSize > 0 Then + ' Apply padding based on maxSize + chunks = ApplyPadding(chunks) End If - Return tokens + + Return chunks End Function - Private Function TrainAndTokenize(singleDocument As String, isWordPiece As Boolean, Epochs As Integer) As List(Of String) - ' Tokenize the document using the current vocabulary. - Dim tokens As List(Of String) = If(isWordPiece, Tokenize(singleDocument, True), Tokenize(singleDocument, False)) - ' Train the tokenizer using the same document. - If isWordPiece Then - TrainWordPiece(singleDocument, Epochs) - Else - TrainBPE(singleDocument, Epochs) - End If + Public Function CustomizeChunkingAndPadding(data As String) As List(Of String) + Dim chunks As List(Of String) = Chunk(data, chunkType) - ' Re-tokenize the document with the updated vocabulary. - Return If(isWordPiece, TokenizeWordPiece(singleDocument), TokenizeBPE(singleDocument)) - End Function - Public Sub Train(text As String, isWordPiece As Boolean, Epochs As Integer) - If isWordPiece Then - TrainWordPiece(text, Epochs) - Else - TrainBPE(text, Epochs) + If maxSize > 0 Then + chunks = ApplyPadding(chunks) End If - Prune(1) - End Sub - Private Sub TrainWordPiece(text As String, Epochs As Integer) - ' Tokenize the text into individual characters - Dim Bits As List(Of String) = TokenizeWordPiece(text) - For Each bit As String In Bits - UpdateVocabulary(bit) - Next - - ' Train WordPiece using merging strategy - Dim numMerges As Integer = Epochs ' Define the number of merges, you can adjust it as needed - For mergeIndex As Integer = 0 To numMerges - 1 - MergeMostFrequentBigram() - MergeMostFrequentPair(FindMostFrequentPair.Key) - Next - End Sub - Private Sub TrainBPE(text As String, Epochs As Integer) - ' Tokenize the text into individual characters - Dim Bits As List(Of String) = TokenizeBPE(text) - For Each bit As String In Bits - UpdateVocabulary(bit) - Next - ' Train BPE using merging strategy - Dim numMerges As Integer = Epochs ' Define the number of merges, you can adjust it as needed - For mergeIndex As Integer = 0 To numMerges - 1 - MergeMostFrequentBigram() - MergeMostFrequentPair(FindMostFrequentPair.Key) - Next - End Sub - Private Function FindMostFrequentPair() As KeyValuePair(Of String, Integer) - ' Find the most frequent character pair from the frequency counts. - Return PairFrequencies.Aggregate(Function(x, y) If(x.Value > y.Value, x, y)) + Return chunks End Function - Private Sub MergeMostFrequentPair(pair As String) - ' Merge the most frequent character pair into a new subword unit. - Dim mergedToken As String = pair.Replace(" ", "_") - UpdateVocabulary(mergedToken) - End Sub - Private Sub MergeMostFrequentBigram() - Dim mostFrequentBigram As String = GetMostFrequentBigram() - If mostFrequentBigram IsNot Nothing Then - Dim mergedSubword As String = mostFrequentBigram.Replace("", " ") + ''' + ''' Filters out chunks containing specific punctuation marks or symbols. + ''' + ''' The list of processed text data chunks. + ''' A list of filtered text data chunks. + Public Function FilterUsingPunctuationVocabulary(data As List(Of String), ByRef punctuationVocabulary As HashSet(Of String)) As List(Of String) + Dim filteredData As New List(Of String) - UpdateVocabulary(mergedSubword) + For Each chunk As String In data + Dim symbols As String() = chunk.Split().Where(Function(token) Not Char.IsLetterOrDigit(token(0))).ToArray() - End If - End Sub - Private Function GetMostFrequentBigram() As String - Dim mostFrequentBigram As String = Nothing - Dim maxFrequency As Integer = 0 - - For Each bigram In PairFrequencies.Keys - If PairFrequencies(bigram) > maxFrequency Then - mostFrequentBigram = bigram - maxFrequency = PairFrequencies(bigram) + Dim containsPunctuation As Boolean = False + For Each symbol As String In symbols + If punctuationVocabulary.Contains(symbol) Then + containsPunctuation = True + Exit For + End If + Next + + If Not containsPunctuation Then + filteredData.Add(chunk) End If Next - Return mostFrequentBigram + Return filteredData End Function - Public Shared Function FindFrequentCharacterBigrams(Vocab As List(Of String), ByRef Freq_Threshold As Integer) As List(Of String) - Dim bigramCounts As New Dictionary(Of String, Integer) - - For Each word In Vocab - Dim characters As Char() = word.ToCharArray() + Public Sub ProcessAndFilterChunks(inputPath As String, outputPath As String, filterListPath As String, chunkType As ChunkType, maxSize As Integer) + Dim rawData As String = File.ReadAllText(inputPath) + Dim chunks As List(Of String) = Chunk(rawData, chunkType, maxSize) - For i As Integer = 0 To characters.Length - 2 - Dim bigram As String = characters(i) & characters(i + 1) + If Not String.IsNullOrEmpty(filterListPath) Then + Dim filterList As List(Of KeyValuePair(Of String, String)) = EntityLoader.LoadEntityListFromFile(filterListPath) + chunks = ApplyFiltering(chunks, filterList) + End If - If bigramCounts.ContainsKey(bigram) Then - bigramCounts(bigram) += 1 - Else - bigramCounts.Add(bigram, 1) - End If - Next - Next + ' Apply padding if maxSize is specified + If maxSize > 0 Then + chunks = ApplyPadding(chunks, maxSize) + End If - Dim frequentCharacterBigrams As New List(Of String) + ' Output to different formats + OutputToListOfLists(chunks, Path.Combine(outputPath, "output.txt")) + OutputToCSV(chunks, Path.Combine(outputPath, "output.csv")) + OutputToJSON(chunks, Path.Combine(outputPath, "output.json")) + End Sub - For Each pair In bigramCounts - If pair.Value > Freq_Threshold Then ' Adjust the threshold as needed - frequentCharacterBigrams.Add(pair.Key) - End If - Next + Public Function ProcessFile(inputPath As String, outputDirectory As String) + Dim rawData As String = File.ReadAllText(inputPath) + Dim chunks As List(Of String) = Chunk(rawData, chunkType) - Return frequentCharacterBigrams - End Function - Public Shared Function GetHighFreq(ByRef Vocabulary As Dictionary(Of String, Integer), ByRef Threshold As Integer) As List(Of String) - Dim HighFreq As New List(Of String) - For Each item In Vocabulary - If item.Value > Threshold Then - HighFreq.Add(item.Key) - End If - Next - Return HighFreq - End Function - Public Shared Function TokenizeToCharacter(text As String) As List(Of String) - Return BasicTokenizer.TokenizeToCharacter(text) - End Function - Public Shared Function TokenizeToWord(text As String) As List(Of String) - Return BasicTokenizer.TokenizeToWord(text) - End Function - Public Shared Function TokenizeToSentence(text As String) As List(Of String) - Return BasicTokenizer.TokenizeToSentence(text) - End Function - Public Shared Function TokenizeToSentenceGram(text As String, ByRef n As Integer) As List(Of String) - Return NgramTokenizer.TokenizetoSentence(text, n) - End Function - Public Shared Function TokenizeToWordGram(text As String, ByRef n As Integer) As List(Of String) - Return NgramTokenizer.TokenizetoWord(text, n) - End Function - Public Shared Function TokenizeToNGram(text As String, ByRef n As Integer) As List(Of String) - Return NgramTokenizer.TokenizetoCharacter(text, n) - End Function - Public Shared Function TokenizeToBitWord(text As String, ByRef Vocab As Dictionary(Of String, Integer)) As List(Of String) - Dim Words = Tokenizer.TokenizeToWord(text) - Dim Tokens As New List(Of String) - For Each item In Words - Tokens.AddRange(TokenizeBitWord(item, Vocab)) - Next - Return Tokens + ' Output to different formats + OutputToListOfLists(chunks, Path.Combine(outputDirectory, "output.txt")) + OutputToCSV(chunks, Path.Combine(outputDirectory, "output.csv")) + OutputToJSON(chunks, Path.Combine(outputDirectory, "output.json")) + Return chunks End Function + End Class End Namespace + Namespace MatrixModels + Public Class PMI ''' ''' Calculates the Pointwise Mutual Information (PMI) matrix for the trained model. @@ -2000,6 +576,7 @@ PROC_ERR: ''' ''' Returns a list WordGram Probability Given a Sequence of Tokens ''' + Public Class Wordgram Private n As Integer Public Shared Sub Main() @@ -2184,6 +761,7 @@ PROC_ERR: End Class + Public Class Co_Occurrence_Matrix Public Shared Function PrintOccurrenceMatrix(ByRef coOccurrenceMatrix As Dictionary(Of String, Dictionary(Of String, Integer)), entityList As List(Of String)) As String ' Prepare the header row @@ -2331,6 +909,7 @@ PROC_ERR: End Class + Public Class Word2WordMatrix Private matrix As Dictionary(Of String, Dictionary(Of String, Integer)) @@ -2462,7 +1041,20 @@ PROC_ERR: Return matrix End Function End Class + + Public Class ConditionalProbabilityTable + Public Property Node As BeliefNode + Public Property Values As Dictionary(Of List(Of String), Double) + + Public Sub New(node As BeliefNode) + Me.Node = node + Values = New Dictionary(Of List(Of String), Double) + End Sub + Public Sub SetEntry(parentStates As List(Of String), value As Double) + Values(parentStates) = value + End Sub + End Class End Namespace Namespace Readers Public Class WordListReader @@ -2956,6 +1548,7 @@ PROC_ERR: End Namespace Namespace VocabularyModelling + Public Class VocabularyBuilder Private embeddingMatrix As Double(,) Private embeddingSize As Integer @@ -3268,6 +1861,7 @@ PROC_ERR: End Function End Class + Public Class VocabularyGenerator Public Shared Function CreateDictionaryVocabulary(data As List(Of String)) As HashSet(Of String) @@ -3580,6 +2174,7 @@ Namespace EncoderDecoders ''' DecodeSentenceStr: Decodes a list Of positional embeddings And returns the corresponding String tokens As a list Of strings. ''' DecodeSentenceEmbedding: Decodes a list Of positional embeddings And returns the corresponding token embeddings As a list Of lists Of doubles. ''' + Public Class PositionalEncoderDecoder Private encodingMatrix As List(Of List(Of Double)) Private Vocabulary As New List(Of String) @@ -3709,73 +2304,130 @@ Namespace EncoderDecoders End Namespace Namespace Utilitys - Public Module TextProcessingTasks - - - Public Function PerformTasks(ByRef Txt As String, ByRef Tasks As List(Of TextPreProcessingTasks)) As String - - For Each tsk In Tasks - Select Case tsk - - Case TextPreProcessingTasks.Space_Punctuation - - Txt = SpacePunctuation(Txt).Replace(" ", " ") - Case TextPreProcessingTasks.To_Upper - Txt = Txt.ToUpper.Replace(" ", " ") - Case TextPreProcessingTasks.To_Lower - Txt = Txt.ToLower.Replace(" ", " ") - Case TextPreProcessingTasks.Lemmatize_Text - Case TextPreProcessingTasks.Tokenize_Characters - Txt = TokenizeChars(Txt) - Dim Words() As String = Txt.Split(",") - Txt &= vbNewLine & "Total Tokens in doc -" & Words.Count - 1 & ":" & vbNewLine - Case TextPreProcessingTasks.Remove_Stop_Words - TextExtensions.RemoveStopWords(Txt) - Case TextPreProcessingTasks.Tokenize_Words - Txt = TokenizeWords(Txt) - Dim Words() As String = Txt.Split(",") - Txt &= vbNewLine & "Total Tokens in doc -" & Words.Count - 1 & ":" & vbNewLine - Case TextPreProcessingTasks.Tokenize_Sentences - Txt = TokenizeSentences(Txt) - Dim Words() As String = Txt.Split(",") - Txt &= vbNewLine & "Total Tokens in doc -" & Words.Count - 2 & ":" & vbNewLine - Case TextPreProcessingTasks.Remove_Symbols - Txt = RemoveSymbols(Txt).Replace(" ", " ") - Case TextPreProcessingTasks.Remove_Brackets - Txt = RemoveBrackets(Txt).Replace(" ", " ") - Case TextPreProcessingTasks.Remove_Maths_Symbols - Txt = RemoveMathsSymbols(Txt).Replace(" ", " ") - Case TextPreProcessingTasks.Remove_Punctuation - Txt = RemovePunctuation(Txt).Replace(" ", " ") - Case TextPreProcessingTasks.AlphaNumeric_Only - Txt = AlphaNumericOnly(Txt).Replace(" ", " ") - End Select + +End Namespace +Public Module Extensions + + Public Sub ModelExporter(ByRef Model As Object, Filename As String) + Dim path As String = Application.StartupPath + + Dim FileStream As New System.IO.FileStream(Filename, System.IO.FileMode.CreateNew) + Dim Formatter As New BinaryFormatter + Formatter.Serialize(Model, FileStream) + FileStream.Close() + + + End Sub + + + + + ''' + ''' Extracts words between based on the before and after words + ''' IE: THe cat sat on the mat (before The After The) output: cat sat on + ''' + ''' + ''' + ''' + ''' + + Public Function ExtractWordsBetween(sentence As String, beforeWord As String, afterWord As String) As List(Of String) + Dim words As New List(Of String)() + + Dim sentenceWords As String() = sentence.Split(" "c) + Dim startIndex As Integer = -1 + Dim endIndex As Integer = -1 + + ' Find the starting and ending indices of the target words + For i As Integer = 0 To sentenceWords.Length - 1 + If sentenceWords(i).Equals(beforeWord, StringComparison.OrdinalIgnoreCase) Then + startIndex = i + End If + + If sentenceWords(i).Equals(afterWord, StringComparison.OrdinalIgnoreCase) Then + endIndex = i + End If + Next + + ' Extract words between the target words + If startIndex <> -1 AndAlso endIndex <> -1 AndAlso startIndex < endIndex Then + For i As Integer = startIndex + 1 To endIndex - 1 + words.Add(sentenceWords(i)) Next + End If - Return Txt - End Function + Return words + End Function + + + Public Function StartsWithAny(str As String, values As IEnumerable(Of String)) As Boolean + For Each value As String In values + If str.StartsWith(value) Then + Return True + End If + Next + + Return False + End Function + + + + Public Function StartsWithAny(ByVal input As String, ByVal values As String()) As Boolean + For Each value As String In values + If input.StartsWith(value, StringComparison.OrdinalIgnoreCase) Then + Return True + End If + Next + Return False + End Function - Public Enum TextPreProcessingTasks - Space_Punctuation - To_Upper - To_Lower - Lemmatize_Text - Tokenize_Characters - Remove_Stop_Words - Tokenize_Words - Tokenize_Sentences - Remove_Symbols - Remove_Brackets - Remove_Maths_Symbols - Remove_Punctuation - AlphaNumeric_Only - End Enum - - End Module -End Namespace -Public Module Extensions + + Public Function ContainsAny(text As String, indicators As String()) As Boolean + For Each indicator As String In indicators + If text.Contains(indicator) Then + Return True + End If + Next + + Return False + End Function + + + + + + Public Function ModelImporter(ByRef Filename As String) As Object + Dim FileStream As New System.IO.FileStream(Filename, System.IO.FileMode.CreateNew) + Dim Formatter As New BinaryFormatter + Dim Model As Object = Formatter.Deserialize(FileStream) + FileStream.Close() + + Return Model + End Function + ''' + ''' Writes the contents of an embedded resource embedded as Bytes to disk. + ''' + ''' Embedded resource + ''' Save to file + ''' + + Public Sub FileSave(ByVal BytesToWrite() As Byte, ByVal FileName As String) + + If IO.File.Exists(FileName) Then + IO.File.Delete(FileName) + End If + + Dim FileStream As New System.IO.FileStream(FileName, System.IO.FileMode.OpenOrCreate) + Dim BinaryWriter As New System.IO.BinaryWriter(FileStream) + + BinaryWriter.Write(BytesToWrite) + BinaryWriter.Close() + FileStream.Close() + End Sub + + Public Function ConvertToDataTable(Of T)(ByVal list As IList(Of T)) As DataTable Dim table As New DataTable() Dim fields() = GetType(T).GetFields() @@ -3791,7 +2443,7 @@ Public Module Extensions Next Return table End Function - + Public Function CreateDataTable(ByRef HeaderTitles As List(Of String)) As DataTable Dim DT As New DataTable For Each item In HeaderTitles @@ -3878,7 +2530,7 @@ Public Module Extensions Dim Converter As New JavaScriptSerializer Return Converter.Serialize(iObject) End Function - + Function CalculateWordOverlap(tokens1 As String(), tokens2 As String()) As Integer Dim overlap As Integer = 0 @@ -3923,76 +2575,6 @@ Public Module Extensions End Enum - Public Class PunctuationMarkers - Public Shared ReadOnly SeperatorPunctuation() As String = {" ", ",", "|"} - Public Shared ReadOnly Symbols() As String = {"@", "#", "$", "%", "&", "*", "+", "=", "^", "_", "~", "§", "°", "¿", "¡"} - Public Shared ReadOnly EncapuslationPunctuationEnd() As String = {"}", "]", ">", ")"} - Public Shared ReadOnly EncapuslationPunctuationStart() As String = {"{", "[", "<", "("} - Public Shared ReadOnly GramaticalPunctuation() As String = {".", "?", "!", ":", ";", ","} - Public Shared ReadOnly MathPunctuation = New String() {"+", "-", "*", "/", "=", "<", ">", "≤", "≥", "±", "≈", "≠", "%", "‰", "‱", "^", "_", "√", "∛", "∜", "∫", "∬", "∭", "∮", "∯", "∰", "∇", "∂", "∆", "∏", "∑", "∐", "⨀", "⨁", "⨂", "⨃", "⨄", "∫", "∬", "∭", "∮", "∯", "∰", "∇", "∂", "∆", "∏", "∑", "∐", "⨀", "⨁", "⨂", "⨃", "⨄"} - Public Shared ReadOnly MoneyPunctuation() As String = {"$", "€", "£", "¥", "₹", "₽", "₿"} - Public Shared ReadOnly CodePunctuation() As String = {"\", "#", "@", "^"} - - Public Shared ReadOnly Delimiters() As Char = {CType(" ", Char), CType(".", Char), - CType(",", Char), CType("?", Char), - CType("!", Char), CType(";", Char), - CType(":", Char), Chr(10), Chr(13), vbTab} - - Public ReadOnly Property SentenceEndPunctuation As List(Of String) - Get - Dim markers() As String = {".", ";", ":", "!", "?"} - Return markers.ToList - End Get - End Property - - Public Shared ReadOnly Property Punctuation As List(Of String) - Get - Dim x As New List(Of String) - x.AddRange(SeperatorPunctuation) - x.AddRange(Symbols) - x.AddRange(EncapuslationPunctuationStart) - x.AddRange(EncapuslationPunctuationEnd) - x.AddRange(MoneyPunctuation) - x.AddRange(MathPunctuation) - x.AddRange(GramaticalPunctuation) - x.AddRange(CodePunctuation) - Return x.Distinct.ToList - End Get - End Property - - End Class - Public Enum TokenType - GramaticalPunctuation - EncapuslationPunctuationStart - EncapuslationPunctuationEnd - MoneyPunctuation - MathPunctuation - CodePunctuation - AlphaBet - Number - Symbol - SeperatorPunctuation - Ignore - Word - Sentence - Character - Ngram - WordGram - SentenceGram - BitWord - Punctuation - whitespace - End Enum - Public Enum TokenizerType - _Char - _Word - _Sentence - _Paragraph - _BPE - _Wordpiece - _Token - _TokenID - End Enum Public Class RemoveToken @@ -4450,7 +3032,7 @@ Public Module Extensions Next Next - Return Tokenizer.GetHighFreq(NgramCounts, Freq_threshold) + Return GetHighFreqLst(NgramCounts, Freq_threshold) End Function Public Shared Function GetTokenGramCounts(Tokens As List(Of String), N As Integer) As Dictionary(Of String, Integer) Dim NgramCounts As New Dictionary(Of String, Integer) @@ -4473,7 +3055,7 @@ Public Module Extensions Public Shared Function GetFrequentTokenNgrams(Tokens As List(Of String), N As Integer, ByRef Freq_threshold As Integer) As List(Of String) Dim NgramCounts As Dictionary(Of String, Integer) = GetTokenGramCounts(Tokens, N) - Dim frequentWordNgrams As List(Of String) = Tokenizer.GetHighFreq(NgramCounts, Freq_threshold) + Dim frequentWordNgrams As List(Of String) = GetHighFreqLst(NgramCounts, Freq_threshold) Return frequentWordNgrams End Function diff --git a/SourceCode/LanguageModels.vb b/SourceCode/LanguageModels.vb index 7b2efb0..6fe45eb 100644 --- a/SourceCode/LanguageModels.vb +++ b/SourceCode/LanguageModels.vb @@ -8,1799 +8,10 @@ Imports InputModelling.LanguageModels.BaseModels.LanguageModelFactory.NgramModel Imports InputModelling.LanguageModels.BaseModels.LanguageModelFactory.PredictiveLanguageModel Imports InputModelling.Utilitys Imports InputModelling.Utilitys.NN +Imports InputModelling.Utilitys.TEXT Namespace LanguageModels - ''' - ''' Corpus Language Model - ''' Used to HoldDocuments : a corpus of documents Calculating detecting the - ''' known entitys and topics in the model; - ''' A known list of Entitys and Topics are required to create this model - ''' This language model is ideally suited for NER / and other corpus interogations - ''' - ''' - Public Class Corpus - - Public Class iCompare - - Public Shared Function GetDistinctWords(text As String) As HashSet(Of String) - ' Split the text into words and return a HashSet of distinct words - Dim words() As String = text.Split({" ", ".", ",", ";", ":", "!", "?"}, StringSplitOptions.RemoveEmptyEntries) - Dim distinctWords As New HashSet(Of String)(words, StringComparer.OrdinalIgnoreCase) - - Return distinctWords - End Function - - Public Shared Function BuildWordVector(words As HashSet(Of String)) As Dictionary(Of String, Integer) - Dim wordVector As New Dictionary(Of String, Integer) - - For Each word As String In words - If wordVector.ContainsKey(word) Then - wordVector(word) += 1 - Else - wordVector(word) = 1 - End If - Next - - Return wordVector - End Function - - '1. Cosine Similarity Calculation: - '```vb - Public Shared Function ComputeCosineSimilarity(phrase1 As String, phrase2 As String) As Double - Dim words1 As HashSet(Of String) = GetDistinctWords(phrase1) - Dim words2 As HashSet(Of String) = GetDistinctWords(phrase2) - - Dim wordVector1 As Dictionary(Of String, Integer) = BuildWordVector(words1) - Dim wordVector2 As Dictionary(Of String, Integer) = BuildWordVector(words2) - - Dim dotProduct As Integer = ComputeDotProduct(wordVector1, wordVector2) - Dim magnitude1 As Double = ComputeVectorMagnitude(wordVector1) - Dim magnitude2 As Double = ComputeVectorMagnitude(wordVector2) - - ' Compute the cosine similarity as the dot product divided by the product of magnitudes - Dim similarityScore As Double = dotProduct / (magnitude1 * magnitude2) - - Return similarityScore - End Function - - Public Shared Function ComputeDotProduct(vector1 As Dictionary(Of String, Integer), vector2 As Dictionary(Of String, Integer)) As Integer - Dim dotProduct As Integer = 0 - - For Each word As String In vector1.Keys - If vector2.ContainsKey(word) Then - dotProduct += vector1(word) * vector2(word) - End If - Next - - Return dotProduct - End Function - - '2. Jaccard Similarity Calculation: - '```vb - Public Shared Function ComputeJaccardSimilarity(phrase1 As String, phrase2 As String) As Double - Dim words1 As HashSet(Of String) = GetDistinctWords(phrase1) - Dim words2 As HashSet(Of String) = GetDistinctWords(phrase2) - - Dim intersectionCount As Integer = words1.Intersect(words2).Count() - Dim unionCount As Integer = words1.Count + words2.Count - intersectionCount - - ' Compute the Jaccard Similarity as the ratio of intersection count to union count - Dim similarityScore As Double = intersectionCount / unionCount - - Return similarityScore - End Function - - Public Shared Function ComputeSimilarityScore(phrase As String, contextLine As String) As Double - ' Here you can implement your own logic for computing the similarity score between the phrase and the context line. - ' For simplicity, let's use a basic approach that counts the number of common words between them. - - Dim phraseWords As HashSet(Of String) = GetDistinctWords(phrase) - Dim contextWords As HashSet(Of String) = GetDistinctWords(contextLine) - - Dim commonWordsCount As Integer = phraseWords.Intersect(contextWords).Count() - - Dim totalWordsCount As Integer = phraseWords.Count + contextWords.Count - - ' Compute the similarity score as the ratio of common words count to total words count - Dim similarityScore As Double = commonWordsCount / totalWordsCount - - Return similarityScore - End Function - - Public Shared Function ComputeVectorMagnitude(vector As Dictionary(Of String, Integer)) As Double - Dim magnitude As Double = 0 - - For Each count As Integer In vector.Values - magnitude += count * count - Next - - magnitude = Math.Sqrt(magnitude) - - Return magnitude - End Function - - End Class - - ''' - ''' Used to create NewCorpus - With Or Without a Recognition template - ''' - Public Class ProcessInputAPI - Private iCurrentOriginalText As String - Private KnownEntitys As Corpus.Recognition_Data - - Public Sub New(ByRef KnownData As Corpus.Recognition_Data) - Me.KnownEntitys = KnownData - End Sub - - Public Sub New() - KnownEntitys = New Corpus.Recognition_Data - End Sub - - Public ReadOnly Property CurrentInput As String - Get - Return iCurrentOriginalText - End Get - End Property - - Public Function ProcessDocument(ByRef InputText As String) As Corpus - Dim iCorpus As New Corpus(KnownEntitys) - iCorpus.AddDocument(InputText) - Return iCorpus - End Function - - Public Function ProcessCorpus(ByRef InputText As List(Of String)) As Corpus - Dim iCorpus As New Corpus(KnownEntitys) - iCorpus.AddCorpus(InputText) - Return iCorpus - End Function - - End Class - - Public Shared Function ExtractSimilarPhrases(text As String, searchPhrase As String, similarityThreshold As Double) As List(Of String) - Dim result As New List(Of String)() - - Dim sentences() As String = text.Split({".", "!", "?"}, StringSplitOptions.RemoveEmptyEntries) - - For Each sentence As String In sentences - Dim similarityScore As Double = iCompare.ComputeSimilarityScore(searchPhrase, sentence) - - If similarityScore >= similarityThreshold Then - result.Add(sentence) - End If - Next - - Return result - End Function - - Public Shared Function QueryCorpus(question As String, corpus As List(Of String)) As String - Dim maxScore As Double = Double.MinValue - Dim bestAnswer As String = "" - - For Each document As String In corpus - Dim score As Double = iCompare.ComputeSimilarityScore(question, document) - - If score > maxScore Then - maxScore = score - bestAnswer = document - End If - Next - - Return bestAnswer - End Function - - ''' - ''' Returns phrase and surrounding comments and position - ''' - ''' - ''' - ''' - Public Shared Function SearchPhraseInCorpus(corpus As List(Of String), phrase As String) As Dictionary(Of String, List(Of String)) - Dim result As New Dictionary(Of String, List(Of String))() - - For i As Integer = 0 To corpus.Count - 1 - Dim document As String = corpus(i) - Dim lines() As String = document.Split(Environment.NewLine) - - For j As Integer = 0 To lines.Length - 1 - Dim line As String = lines(j) - Dim index As Integer = line.IndexOf(phrase, StringComparison.OrdinalIgnoreCase) - - While index >= 0 - Dim context As New List(Of String)() - - ' Get the surrounding context sentences - Dim startLine As Integer = Math.Max(0, j - 1) - Dim endLine As Integer = Math.Min(lines.Length - 1, j + 1) - - For k As Integer = startLine To endLine - context.Add(lines(k)) - Next - - ' Add the result to the dictionary - Dim position As String = $"Document: {i + 1}, Line: {j + 1}, Character: {index + 1}" - result(position) = context - - ' Continue searching for the phrase in the current line - index = line.IndexOf(phrase, index + 1, StringComparison.OrdinalIgnoreCase) - End While - Next - Next - - Return result - End Function - - ''' - ''' Searches for phrases based on simularity ie same words - ''' - ''' - ''' - ''' - ''' - Public Shared Function SearchPhraseInCorpus(corpus As List(Of String), phrase As String, similarityThreshold As Double) As Dictionary(Of String, List(Of String)) - Dim result As New Dictionary(Of String, List(Of String))() - - For i As Integer = 0 To corpus.Count - 1 - Dim document As String = corpus(i) - Dim lines() As String = document.Split(Environment.NewLine) - - For j As Integer = 0 To lines.Length - 1 - Dim line As String = lines(j) - Dim index As Integer = line.IndexOf(phrase, StringComparison.OrdinalIgnoreCase) - - While index >= 0 - Dim context As New List(Of String)() - - ' Get the surrounding context sentences - Dim startLine As Integer = Math.Max(0, j - 1) - Dim endLine As Integer = Math.Min(lines.Length - 1, j + 1) - - For k As Integer = startLine To endLine - Dim contextLine As String = lines(k) - - ' Compute the similarity score between the context line and the phrase - Dim similarityScore As Double = iCompare.ComputeSimilarityScore(phrase, contextLine) - - ' Add the context line only if its similarity score exceeds the threshold - If similarityScore >= similarityThreshold Then - context.Add(contextLine) - End If - Next - - ' Add the result to the dictionary - Dim position As String = $"Document: {i + 1}, Line: {j + 1}, Character: {index + 1}" - result(position) = context - - ' Continue searching for the phrase in the current line - index = line.IndexOf(phrase, index + 1, StringComparison.OrdinalIgnoreCase) - End While - Next - Next - - Return result - End Function - - Public Function ToJson(ByRef iObject As Object) As String - Dim Converter As New JavaScriptSerializer - Return Converter.Serialize(iObject) - End Function - - Public Class Tokenizer - - ''' - ''' Normalizes the input string by converting it to lowercase and removing punctuation and extra whitespace. - ''' - ''' The input string. - ''' The normalized input string. - Public Function NormalizeInput(input As String) As String - ' Convert to lowercase - Dim normalizedInput As String = input.ToLower() - - ' Remove punctuation - normalizedInput = Regex.Replace(normalizedInput, "[^\w\s]", "") - - ' Remove extra whitespace - normalizedInput = Regex.Replace(normalizedInput, "\s+", " ") - - Return normalizedInput - End Function - - ''' - ''' Tokenizes the input string by character. - ''' - ''' The input string. - ''' The list of character tokens. - Public Shared Function TokenizeByCharacter(input As String) As List(Of Token) - Dim tokens As New List(Of Token) - - For i As Integer = 0 To input.Length - 1 - Dim token As New Token(input(i).ToString()) - tokens.Add(token) - Next - - Return tokens - End Function - - ''' - ''' Tokenizes the input string by word. - ''' - ''' The input string. - ''' The list of word tokens. - Public Shared Function TokenizeByWord(input As String) As List(Of Token) - Dim tokens As New List(Of Token) - Dim words As String() = input.Split(" "c) - - For i As Integer = 0 To words.Length - 1 - Dim token As New Token(words(i)) - tokens.Add(token) - Next - - Return tokens - End Function - - ''' - ''' Tokenizes the input string by sentence. - ''' - ''' The input string. - ''' The list of sentence tokens. - Public Shared Function TokenizeBySentence(input As String) As List(Of Token) - Dim tokens As New List(Of Token) - Dim sentences As String() = input.Split("."c) - - For i As Integer = 0 To sentences.Length - 1 - Dim token As New Token(sentences(i)) - tokens.Add(token) - Next - - Return tokens - End Function - - ''' - ''' Tokenizes the input string by whitespace. - ''' - ''' The input string. - ''' The list of tokens. - Public Shared Function Tokenize(input As String) As List(Of String) - ' Simple tokenization by splitting on whitespace - Return New List(Of String)(input.Split({" "c}, StringSplitOptions.RemoveEmptyEntries)) - End Function - - Public Class Token - - ''' - ''' Initializes a new instance of the Token class. - ''' - ''' The string value of the token. - Public Sub New(value As String) - If value Is Nothing Then - Throw New ArgumentNullException(NameOf(value)) - End If - - Me.Value = value - End Sub - - ''' - ''' Initializes a new instance of the Token class with sequence encoding. - ''' - ''' The string value of the token. - ''' The sequence encoding value of the token. - Public Sub New(value As String, sequenceEncoding As Integer) - Me.New(value) - Me.SequenceEncoding = sequenceEncoding - End Sub - - ''' - ''' Gets or sets the embeddings of the token. - ''' - Public Property Embeddings As List(Of Double) - - ''' - ''' Calculates the similarity between this token and the given token. - ''' - ''' The other token. - ''' The similarity value between the tokens. - Private Function CalculateSimilarity(token As Token) As Double - If Embeddings IsNot Nothing AndAlso token.Embeddings IsNot Nothing Then - Dim dotProduct As Double = 0.0 - Dim magnitudeA As Double = 0.0 - Dim magnitudeB As Double = 0.0 - - For i As Integer = 0 To Embeddings.Count - 1 - dotProduct += Embeddings(i) * token.Embeddings(i) - magnitudeA += Math.Pow(Embeddings(i), 2) - magnitudeB += Math.Pow(token.Embeddings(i), 2) - Next - - magnitudeA = Math.Sqrt(magnitudeA) - magnitudeB = Math.Sqrt(magnitudeB) - - If magnitudeA = 0.0 OrElse magnitudeB = 0.0 Then - Return 0.0 - Else - Return dotProduct / (magnitudeA * magnitudeB) - End If - Else - Return 0.0 - End If - End Function - - ''' - ''' Gets or sets the string value of the token. - ''' - Public Property Value As String - - ''' - ''' Gets or sets the sequence encoding value of the token. - ''' - Public Property SequenceEncoding As Integer - - ''' - ''' Gets or sets the positional encoding value of the token. - ''' - Public Property PositionalEncoding As Integer - - ''' - ''' Gets or sets the frequency of the token in the language model corpus. - ''' - Public Property Frequency As Double - - ''' - ''' Gets or sets the embedding vector of the token. - ''' - Public Property Embedding As Double - - Public Function CalculateSelfAttention(tokens As List(Of Token)) As Double - Dim total As Double = 0.0 - For Each token As Token In tokens - total += CalcSimilarity(token) - Next - Return Math.Log(Math.Sqrt(total)) - End Function - - Private Function CalcSimilarity(token As Token) As Double - If Embeddings IsNot Nothing AndAlso token.Embeddings IsNot Nothing Then - Dim dotProduct As Double = 0.0 - For i As Integer = 0 To Embeddings.Count - 1 - dotProduct += Embeddings(i) * token.Embeddings(i) - Next - Return dotProduct - End If - Return 0.0 - End Function - - ''' - ''' Calculates the self-attention of the token within the given list of tokens. - ''' - ''' The list of tokens. - ''' The self-attention value of the token. - Public Function CalculateAttention(tokens As List(Of Token)) As Double - Dim qVector As List(Of Double) = Me.Embeddings - Dim kMatrix As New List(Of Double) - Dim vMatrix As New List(Of Double) - - ' Create matrices K and V - For Each token In tokens - kMatrix.Add(token.Embedding) - vMatrix.Add(token.Embedding) - Next - - ' Compute self-attention - Dim attention As Double = 0.0 - Dim sqrtKLength As Double = Math.Sqrt(kMatrix(0)) - - For i As Integer = 0 To kMatrix.Count - 1 - Dim kVector As List(Of Double) = kMatrix - Dim dotProduct As Double = 0.0 - - ' Check vector dimensions - If qVector.Count = kVector.Count Then - For j As Integer = 0 To qVector.Count - 1 - dotProduct += qVector(j) * kVector(j) - Next - - dotProduct /= sqrtKLength - attention += dotProduct * vMatrix(i) ' We consider only the first element of the value vector for simplicity - Else - ' Handle case when vector dimensions do not match - Console.WriteLine("Vector dimensions do not match.") - End If - Next - - Return attention - End Function - - End Class - - End Class - - ''' - ''' An array of characters (. ! ?) used to tokenize sentences. - ''' - Public Shared ReadOnly SentenceEndMarkers As Char() = {".", "!", "?"} - - Public CorpusContext As List(Of Vocabulary.FeatureContext) - - ''' - ''' A list of strings representing the documents in the corpus. - ''' - Public CorpusDocs As List(Of String) - - ''' - ''' A string representing the concatenated text of all documents in the corpus. - ''' - Public CorpusText As String - - ''' - ''' A list of unique words in the corpus. - ''' - Public CorpusUniqueWords As List(Of String) - - ''' - ''' TotalWords in Corpus - ''' - Public ReadOnly Property CorpusWordcount As Integer - Get - Return GetWordCount() - End Get - End Property - - ''' - ''' A list of Document objects representing individual documents in the corpus. - ''' - Public Documents As List(Of Document) - - ''' - ''' A list of Entity structures representing detected entities in the corpus. - ''' - Public Entitys As List(Of Entity) - - ''' - ''' A Vocabulary object representing the language model. - ''' - Public Langmodel As Vocabulary - - ''' - ''' A Recognition_Data structure representing named entity recognition data. - ''' - Public NER As Recognition_Data - - ''' - ''' A list of Topic structures representing detected topics in the corpus. - ''' - Public Topics As List(Of Topic) - - ''' - ''' Initializes a new instance of the Corpus class. - ''' - ''' The recognition data for entity and topic detection. - Public Sub New(ByVal data As Recognition_Data) - NER = data - Documents = New List(Of Document) - Entitys = New List(Of Entity) - Topics = New List(Of Topic) - CorpusDocs = New List(Of String) - CorpusUniqueWords = New List(Of String) - CorpusText = String.Empty - - Langmodel = New Vocabulary - End Sub - - ''' - ''' type of sentence - ''' - Public Enum SentenceType - Unknown = 0 - Declaritive = 1 - Interogative = 2 - Exclamitory = 3 - Conditional = 4 - Inference = 5 - Imperitive = 6 - End Enum - - ''' - ''' Processes the text by removing unwanted characters, converting to lowercase, and removing extra whitespace. - ''' - ''' - ''' - Public Shared Function ProcessText(ByRef text As String) As String - ' Remove unwanted characters - Dim processedText As String = Regex.Replace(text, "[^a-zA-Z0-9\s]", "") - - ' Convert to lowercase - processedText = processedText.ToLower() - - ' Remove extra whitespace - processedText = Regex.Replace(processedText, "\s+", " ") - - Return processedText - End Function - - ''' - ''' Adds a corpus of documents to the existing corpus. - ''' - ''' - ''' - Public Function AddCorpus(ByRef docs As List(Of String)) As Corpus - - 'Add aCorpus of documents to the corpus - - For Each item In docs - - AddDocument(item) - - Next - UpdateContext() - Return Me - - End Function - - ''' - ''' Adds a document to the corpus and updates the corpus properties. - ''' - ''' - Public Sub AddDocument(ByRef Text As String) - Dim Doc As New Document(Text) - Documents.Add(Doc.AddDocument(ProcessText(Text))) - 'Update Corpus - CorpusDocs.Add(ProcessText(Text)) - - CorpusUniqueWords = GetUniqueWords() - - Dim iText As String = "" - For Each item In Documents - iText &= item.ProcessedText & vbNewLine - - Next - CorpusText = iText - - '' corpus entitys and topics - Doc.Entitys = Entity.DetectEntitys(Doc.ProcessedText, NER.Entitys) - Doc.Topics = Topic.DetectTopics(Doc.ProcessedText, NER.Topics) - Entitys.AddRange(Doc.Entitys) - Entitys = Entitys - - Topics.AddRange(Doc.Topics) - Topics = Topics - 'Update VocabModel - - Dim Wrds = Text.Split(" ") - - For Each item In Wrds - Langmodel.AddNew(item, CorpusDocs) - Next - End Sub - - ''' - ''' Retrieves the list of unique words in the corpus. - ''' - ''' - Public Function GetUniqueWords() As List(Of String) - Dim lst As New List(Of String) - For Each item In Documents - lst.AddRange(item.UniqueWords) - Next - Return lst - End Function - - ''' - ''' Retrieves the total word count in the corpus. - ''' - ''' - Public Function GetWordCount() As Integer - Dim count As Integer = 0 - For Each item In Documents - count += item.WordCount - Next - Return count - End Function - - ''' - ''' Updates the Features in the model (each document context) - ''' by the topics discovered in the text, updating the individual documents and adding the - ''' feature context to the corpus context - ''' - Private Sub UpdateContext() - CorpusContext = New List(Of Vocabulary.FeatureContext) - For Each Topic In Topics.Distinct - For Each doc In Documents - Dim Context = Vocabulary.FeatureContext.GetDocumentContext(Langmodel, doc, Topic.Topic) - doc.Features.Add(Context) - CorpusContext.Add(Context) - Next - Next - - End Sub - - ''' - ''' Represents an individual document in the corpus. It contains properties such as word count, processed text, sentences, topics, etc. - ''' - Public Structure Document - - Public ReadOnly Property WordCount As Integer - Get - Return GetWordCount() - End Get - End Property - - Private Function GetWordCount() As Integer - Dim Str = Functs.TokenizeWords(OriginalText) - Return Str.Count - End Function - - '''' - '''' COntains the Vocabulary for this document - '''' - Public DocumentVocabulary As Vocabulary - - Public Entitys As List(Of Entity) - - ''' - ''' Context can be updated by the corpus owner as required, these contexts - ''' can be used to score the document and provided higher embedding values - ''' - Public Features As List(Of Vocabulary.FeatureContext) - - ''' - ''' Preserve original - ''' - Public OriginalText As String - - ''' - ''' Cleaned Text - ''' - Public ProcessedText As String - - ''' - ''' Sentences within Text - ''' - Public Sentences As List(Of Sentence) - - Public Topics As List(Of Topic) - Public TopWords As List(Of String) - Public UniqueWords As List(Of String) - - Public Sub New(ByRef originalText As String) - - Me.OriginalText = originalText - Topics = New List(Of Topic) - TopWords = New List(Of String) - UniqueWords = New List(Of String) - Sentences = New List(Of Sentence) - DocumentVocabulary = New Vocabulary - Entitys = New List(Of Entity) - End Sub - - Public Function AddDocument(ByRef Text As String) As Document - OriginalText = Text - 'Remove unwanted symbols - ProcessedText = ProcessText(Text) - - Dim Sents As List(Of String) = Text.Split(".").ToList - Dim Count As Integer = 0 - For Each item In Sents - Count += 1 - Dim Sent As New Sentence(item) - Me.Sentences.Add(Sent.AddSentence(item, Count)) - Next - UniqueWords = Corpus.Functs.GetUniqueWordsInText(ProcessedText) - Dim IDocs As New List(Of String) - 'Adds only its-self to its own personal corpus vocabulary(document Specific) - IDocs.Add(ProcessedText) - For Each item In UniqueWords - DocumentVocabulary.AddNew(item, IDocs) - Next - TopWords = Corpus.Functs.GetTopWordsInText(ProcessedText) - - Return Me - End Function - - Public Structure Sentence - - Public Clauses As List(Of Clause) - - Public Entitys As List(Of Entity) - - Public OriginalSentence As String - - Public Position As Integer - - Public ProcessedSentence As String - - Public UniqueWords As List(Of String) - - Private iSentencetype As SentenceType - - Public Sub New(originalSentence As String) - Me.New() - Me.OriginalSentence = originalSentence - Clauses = New List(Of Clause) - Entitys = New List(Of Entity) - UniqueWords = New List(Of String) - End Sub - - Public ReadOnly Property ClauseCount As Integer - Get - Return Clauses.Count - End Get - - End Property - - Public ReadOnly Property SentenceType As String - Get - Select Case iSentencetype - Case Corpus.SentenceType.Conditional - Return "Conditional" - Case Corpus.SentenceType.Declaritive - Return "Declarative" - Case Corpus.SentenceType.Exclamitory - Return "exclamatory" - Case Corpus.SentenceType.Imperitive - Return "imperative" - Case Corpus.SentenceType.Inference - Return "inference" - Case Corpus.SentenceType.Interogative - Return "interrogative" - Case Corpus.SentenceType.Unknown - Return "unknown" - Case Else - Return "unknown" - End Select - End Get - End Property - - Public ReadOnly Property WordCount As Integer - Get - Return GetWordCount(ProcessedSentence) - End Get - End Property - - Public Shared Function GetClauses(ByRef Text As String) As List(Of Clause) - Dim clauses As New List(Of Clause) - - ' - - If Text.Contains(",") Then - Dim iClauses As List(Of String) = Text.Split(",").ToList - For Each item In iClauses - Dim Iclause As New Clause - Iclause.Text = item - Iclause.ClauseSeperator = "," - Dim Words = Functs.TokenizeWords(Iclause.Text) - Dim count As Integer = 0 - For Each wrd In Words - count += 1 - Iclause.Words.Add(New Clause.Word(wrd, count)) - - Next - - clauses.Add(Iclause) - - Next - Else - - 'Add detect end punctuation use for - - Dim Iclause As New Clause - Iclause.Words = New List(Of Clause.Word) - Iclause.Text = Text - 'Use end punctuation - Iclause.ClauseSeperator = "." - Dim Words = Functs.TokenizeWords(Iclause.Text) - Dim count As Integer = 0 - If Words.Count > 0 Then - For Each wrd In Words - - count += 1 - Iclause.Words.Add(New Clause.Word(wrd, count)) - - Next - End If - clauses.Add(Iclause) - - End If - Return clauses - End Function - - Public Function AddSentence(ByRef text As String, ByRef iPosition As Integer) As Sentence - OriginalSentence = text - ProcessedSentence = ProcessText(text) - Clauses = GetClauses(ProcessedSentence) - UniqueWords = Corpus.Functs.GetUniqueWordsInText(ProcessedSentence) - - Position = iPosition - Return Me - End Function - - Private Function GetWordCount(ByRef Text As String) As Integer - Dim Str = Functs.TokenizeWords(Text) - Return Str.Count - End Function - - ''' - ''' Represents a clause within a sentence. It contains properties such as text, word count, words, etc. - ''' - Public Structure Clause - - ''' - ''' Independent Clause / Dependant Clause - ''' - Public Clause As String - - Public ClauseSeperator As String - Public ClauseType As SentenceType - - ''' - ''' Note: if = "." then declarative, = "?" Question = "!" Exclamitory - ''' - Public EndPunctuation As String - - Public Text As String - Public Words As List(Of Clause.Word) - Private mLearningPattern As String - - Private mPredicate As String - - Private mSubjectA As String - - Private mSubjectB As String - - ''' - ''' the learning pattern locates the Subjects in the sentence A# sat on #b - ''' - ''' - Public Property LearningPattern As String - Get - Return mLearningPattern - End Get - Set(value As String) - mLearningPattern = value - End Set - End Property - - ''' - ''' Predicate / Linking verb / Concept (Sat on) (is sitting on) (AtLocation) this is the - ''' dividing content in the sentence - ''' - ''' - Public Property Predicate As String - Get - Return mPredicate - End Get - Set(value As String) - mPredicate = value - End Set - End Property - - ''' - ''' First detected subject (the Cat) - ''' - ''' - Public Property SubjectA As String - Get - Return mSubjectA - End Get - Set(value As String) - mSubjectA = value - End Set - End Property - - ''' - ''' Second detected subject / Object (the mat) - ''' - ''' - Public Property SubjectB As String - Get - Return mSubjectB - End Get - Set(value As String) - mSubjectB = value - End Set - End Property - - Public ReadOnly Property WordCount As Integer - Get - Return Words.Count - End Get - - End Property - - ''' - ''' Represents a word in the text - ''' - Public Structure Word - - ''' - ''' Position of word in Sentence/Document - ''' - Public Position As Integer - - ''' - ''' Word - ''' - Public text As String - - Public Sub New(word As String, position As Integer) - If word Is Nothing Then - Throw New ArgumentNullException(NameOf(word)) - End If - - Me.text = word - Me.Position = position - - End Sub - - End Structure - - End Structure - - End Structure - - End Structure - - ''' - ''' NER Data held(known) by the corpus - ''' - Public Class Recognition_Data - Public Entitys As List(Of Entity) - Public Topics As List(Of Topic) - - Public Sub New() - Entitys = New List(Of Entity) - Topics = New List(Of Topic) - End Sub - - End Class - - Public Structure Term - Public DocNumber As List(Of Integer) - - ''' - ''' Term Frequency - ''' - Dim Freq As Integer - - ''' - ''' Inverse Document Frequency - ''' - Dim IDF As Double - - ''' - ''' Value - ''' - Dim Term As String - - End Structure - - ''' - ''' Represents a topic detected in the text. It has properties for the keyword and topic itself. - ''' - Public Structure Topic - Public Keyword As String - Public Topic As String - - Public Shared Function DetectTopics(ByRef text As String, TopicList As List(Of Topic)) As List(Of Topic) - Dim detectedTopics As New List(Of Topic)() - For Each item In TopicList - If text.ToLower.Contains(item.Keyword) Then - detectedTopics.Add(item) - End If - Next - - Return detectedTopics - End Function - - End Structure - - Public Class Functs - - ''' - ''' Returns the top words in a given text - ''' - ''' - ''' - Public Shared Function GetTopWordsInText(ByRef text As String) As List(Of String) - Dim words As List(Of String) = Functs.TokenizeWords(text) - Dim wordCounts As New Dictionary(Of String, Integer)() - - For Each word As String In words - If wordCounts.ContainsKey(word) Then - wordCounts(word) += 1 - Else - wordCounts(word) = 1 - End If - Next - - ' Sort the words based on their counts in descending order - Dim sortedWords As List(Of KeyValuePair(Of String, Integer)) = wordCounts.OrderByDescending(Function(x) x.Value).ToList() - - ' Get the top 10 words - Dim topWords As List(Of String) = sortedWords.Take(10).Select(Function(x) x.Key).ToList() - - Return topWords - End Function - - ''' - ''' Returns a list of the unique words in the text - ''' - ''' - ''' - Public Shared Function GetUniqueWordsInText(ByRef text As String) As List(Of String) - Dim words As List(Of String) = Functs.TokenizeWords(text) - Dim uniqueWords As List(Of String) = words.Distinct().ToList() - Return uniqueWords - End Function - - Public Shared Sub PrintSentencesToConsole(ByRef iSentences As List(Of String)) - For Each sentence In iSentences - Console.WriteLine(sentence) - Next - End Sub - - ''' - ''' Tokenizes the text into sentences based on punctuation end markers. - ''' - ''' The text to tokenize. - ''' A list of sentences. - Public Shared Function TokenizeSentences(ByVal text As String) As List(Of Document.Sentence) - Dim sentences As New List(Of Document.Sentence)() - - ' Split text into sentences based on punctuation end markers - Dim pattern As String = $"(?<=[{String.Join("", SentenceEndMarkers)}])\s+" - Dim sentenceTexts As String() = Regex.Split(text, pattern) - - For Each sentenceText As String In sentenceTexts - Dim isentence As New Document.Sentence() - isentence.OriginalSentence = sentenceText.Trim() - - isentence.Clauses = Document.Sentence.GetClauses(text) - ' ... other sentence properties ... - sentences.Add(isentence) - Next - - Return sentences - End Function - - ''' - ''' Tokenizes the sentence into words. - ''' - ''' The text of the sentence. - ''' A list of words. - Public Shared Function TokenizeWords(ByVal sentenceText As String) As List(Of String) - Dim words As New List(Of String)() - - ' Split sentence into words - Dim wordPattern As String = "\b\w+\b" - Dim wordMatches As MatchCollection = Regex.Matches(sentenceText, wordPattern) - - For Each match As Match In wordMatches - words.Add(match.Value.ToLower()) - Next - - Return words - End Function - - Public Shared Function Top_N_Words(ByRef iDocContents As String, ByRef n As Integer) As List(Of String) - Dim words As String() = iDocContents.Split(" ") - Dim wordCount As New Dictionary(Of String, Integer) - - ' Count the frequency of each word in the corpus - For Each word As String In words - If wordCount.ContainsKey(word) Then - wordCount(word) += 1 - Else - wordCount.Add(word, 1) - End If - Next - - ' Sort the dictionary by value (frequency) in descending order - Dim sortedDict = (From entry In wordCount Order By entry.Value Descending Select entry).Take(n) - Dim LSt As New List(Of String) - ' Print the top ten words and their frequency - For Each item In sortedDict - LSt.Add(item.Key) - - Next - Return LSt - End Function - - End Class - - ''' - ''' Represents the vocabulary model for the corpus. - ''' (a record of words which can be looked up in the corpus) - ''' It includes methods for adding new terms, calculating frequencies, TF-IDF, etc. - ''' - Public Class Vocabulary - Public Current As List(Of VocabularyEntry) - - ''' - ''' Used for TDM Calc - ''' - Private Docs As List(Of String) - - ''' - ''' Prepare vocabulary for use - ''' - Public Sub New() - Current = New List(Of VocabularyEntry) - Docs = New List(Of String) - End Sub - - ''' - ''' Used to add Words or update a word in the vocabulary language model - ''' - ''' - ''' - Public Sub AddNew(ByRef Term As String, ByRef Docs As List(Of String)) - Me.Docs = Docs - Current.Add(New VocabularyEntry(Term, - CalcSequenceEncoding(Term), - CalcFrequency(Term), - CalcTF_IDF(Term))) - - End Sub - - Private Function CalcFrequency(ByRef Word As String) As Double - ' Calculate frequency of term in the corpus (current) - Dim count As Integer = 0 - For Each entry In Current - If entry.Text = Word Then - - count += 1 + entry.Frequency - Else - Return 1 - End If - Next - Return count - End Function - - Public Function GetEntry(ByRef Token As String) As VocabularyEntry - For Each item In Current - If item.Text = Token Then Return item - Next - Return Nothing - End Function - - Public Function GetEntry(ByRef SequenceEmbedding As Integer) As VocabularyEntry - For Each item In Current - If item.SequenceEncoding = SequenceEmbedding Then Return item - Next - Return Nothing - End Function - - Public Function CheckEntry(ByRef Token As String) As Boolean - For Each item In Current - If item.Text = Token Then Return True - Next - Return False - End Function - - Private Function CalcInverseDocumentFrequency(ByRef Word As String, ByRef Docs As List(Of String)) As Double - ' Calculate Inverse Document Frequency for the given term in the corpus - Dim docsWithTerm As Integer = 0 - For Each doc In Docs - If doc.Contains(Word) Then - docsWithTerm += 1 - End If - Next - Dim idf As Double = Math.Log(Docs.Count / (docsWithTerm + 1)) ' Adding 1 to avoid division by zero - Return idf - End Function - - Private Function CalcSequenceEncoding(ByRef Word As String) As Double - ' Calculate sequence encoding based on the order of appearance in the corpus - Dim encoding As Double = 0.0 - For Each entry In Current - If entry.Text = Word Then - encoding += 1 - End If - Next - Return encoding - End Function - - Private Function CalcTermFrequency(ByRef Word As String) As Double - ' Calculate Term Frequency for the given term in the corpus - Dim count As Integer = 0 - For Each entry In Current - If entry.Text = Word Then - count += 1 - End If - Next - Return count - End Function - - Private Function CalcTF_IDF(ByRef Word As String) As Double - ' Calculate TF-IDF (Term Frequency-Inverse Document Frequency) for the given term in the corpus - Dim tf As Double = CalcTermFrequency(Word) - Dim idf As Double = CalcInverseDocumentFrequency(Word, Docs) - Return tf * idf - End Function - - ''' - ''' Feature context is a way to add information with regards to the document, - ''' Addind context elements such as features. - ''' Given a Sentiment (positive) , by marking the words in this document - ''' present against the corpus vocabulary, it could be suggested that these would - ''' represent that topic in this document - ''' - Public Structure FeatureContext - - ''' - ''' List of items Representing the context, - ''' All entrys contained in the vocabulary are marked with a tag (present)(true) - ''' if the are in the context else marked false - ''' giving a one-shot encoding for the context this collection represents, - ''' Ie:Sentiment/Topic etc - ''' - Public Present As List(Of VocabularyEntry) - - Public Type As String - - ''' - ''' Encodes a Feature into the model, - ''' Provide the label and the document words in the document - ''' will be marked present in the context - ''' Later these Oneshot encoding feature maybe used to increase the scoring vectors - ''' Adding context to the document for a specific feature such as sentiment / Emotion / Topic. - ''' Each topic should be encoded as a feature in the document - ''' - ''' - ''' Current Vocabulary - ''' - ''' - ''' - Public Shared Function GetDocumentContext(ByRef CorpusVocab As Vocabulary, ByRef iDocument As Document, ByRef Label As String) As Vocabulary.FeatureContext - Dim iContext As New Vocabulary.FeatureContext - Dim NewVocab As List(Of Vocabulary.VocabularyEntry) = CorpusVocab.Current - - For Each item In NewVocab - For Each _item In iDocument.UniqueWords - If item.Text = _item Then - 'Encode Presence in text - item.Present = True - End If - Next - Next - iContext.Present = NewVocab - iContext.Type = Label - Return iContext - End Function - - End Structure - - Public Structure InputTextRecord - Public Text As String - Public Encoding As List(Of Integer) - Public Inputblocks As List(Of List(Of Integer)) - Public Targetblocks As List(Of List(Of Integer)) - Public blocksize As Integer - - Public Shared Function GetBlocks(ByRef Embedding As List(Of Integer), ByRef Size As Integer, Optional Ofset As Integer = 0) As List(Of List(Of Integer)) - Dim pos As Integer = 0 - Dim newPos As Integer = Size - Dim blocks As New List(Of List(Of Integer)) - Dim block As New List(Of Integer) - Do While pos < Embedding.Count - 1 - For i = pos To newPos - 1 - If Ofset > 0 Then - If i + Ofset < Embedding.Count - 1 Then - - block.Add(Embedding(i + Ofset)) - 'block.Add(Embedding(i)) - Else - block.Add(Embedding(i)) - End If - Else - block.Add(Embedding(i)) - End If - - Next - blocks.Add(block) - block = New List(Of Integer) - pos = newPos - - If newPos + Size < Embedding.Count - 1 Then - newPos += Size - Else - newPos = Embedding.Count - End If - - Loop - - Return blocks - End Function - - Public Shared Function GetTargetBlocks(ByRef Embedding As List(Of Integer), ByRef Size As Integer) As List(Of List(Of Integer)) - Dim pos As Integer = 0 - Dim newPos As Integer = Size - Dim blocks As New List(Of List(Of Integer)) - Dim block As New List(Of Integer) - Do While pos < Embedding.Count - 1 - For i = pos To newPos - 1 - block.Add(Embedding(i)) - - Next - blocks.Add(block) - block = New List(Of Integer) - pos = newPos - If newPos + Size < Embedding.Count - 1 Then - newPos += Size - Else - newPos = Embedding.Count - End If - - Loop - - Return blocks - End Function - - End Structure - - Public Class Encode - - Public Shared Function Encode_Text(ByRef Text As String, ByRef Vocab As List(Of VocabularyEntry), ByRef Type As VocabularyType) As List(Of Integer) - Dim iOutput As New List(Of Integer) - Select Case Type - Case VocabularyType.Character - Dim Chars = Tokenizer.TokenizeByCharacter(Text) - - For Each item In Chars - If CheckVocabulary(item.Value.ToLower, Vocab) = True Then - iOutput.Add(Decode.DecodeText(item.Value.ToLower, Vocab)) - End If - Next - Case VocabularyType.Word - Dim Words = Tokenizer.TokenizeByWord(Text) - - For Each item In Words - If CheckVocabulary(item.Value.ToLower, Vocab) = True Then - iOutput.Add(Decode.DecodeText(item.Value.ToLower, Vocab)) - End If - Next - Case VocabularyType.Sentence - Dim Sents = Tokenizer.TokenizeBySentence(Text) - - For Each item In Sents - If CheckVocabulary(item.Value, Vocab) = True Then - iOutput.Add(Decode.DecodeText(item.Value.ToLower, Vocab)) - End If - Next - End Select - Return iOutput - End Function - - Public Shared Function EncodeChars(VocabList As List(Of String)) As List(Of VocabularyEntry) - Dim vocabulary As New List(Of VocabularyEntry) - Dim EncodingValue As Integer = 1 - For Each item In VocabList - Dim newVocabRecord As New VocabularyEntry - newVocabRecord.Encoding = EncodingValue - newVocabRecord.Text = item - EncodingValue += 1 - vocabulary.Add(newVocabRecord) - Next - Return vocabulary - End Function - - Public Shared Function EncodeWords(VocabList As List(Of String)) As List(Of VocabularyEntry) - Dim vocabulary As New List(Of VocabularyEntry) - Dim EncodingValue As Integer = 1 - For Each item In VocabList - Dim newVocabRecord As New VocabularyEntry - newVocabRecord.Encoding = EncodingValue - newVocabRecord.Text = item - EncodingValue += 1 - vocabulary.Add(newVocabRecord) - Next - Return vocabulary - End Function - - Public Shared Function AddNewEncoding(ByRef Word As String, ByRef Vocab As List(Of VocabularyEntry)) As List(Of VocabularyEntry) - Dim NewVocab As New List(Of VocabularyEntry) - If CheckVocabulary(Word, Vocab) = False Then - NewVocab = Vocab - Dim NewItem As New VocabularyEntry - NewItem.Text = Word - NewItem.Encoding = Vocab.Count - NewVocab.Add(NewItem) - Return NewVocab - Else - Return Vocab - End If - End Function - - Public Shared Function CheckVocabulary(ByRef Word As String, ByRef Vocab As List(Of VocabularyEntry)) As Boolean - - For Each item In Vocab - If item.Text = Word Then - Return True - End If - Next - Return False - End Function - - End Class - - Public Class Decode - - Public Shared Function DecodeInteger(ByRef Lookup As Integer, ByRef Vocabulary As List(Of VocabularyEntry)) - For Each item In Vocabulary - If item.Encoding = Lookup Then - Return item.Text - End If - Next - Return "Not found in vocabulary" - End Function - - Public Shared Function DecodeText(ByRef Lookup As String, ByRef Vocabulary As List(Of VocabularyEntry)) - For Each item In Vocabulary - If item.Text = Lookup Then - Return item.Encoding - End If - Next - Return "Not found in vocabulary" - End Function - - End Class - - Public Class VocabularyEntry - Public Text As String - Public Encoding As Integer - Public Frequency As Integer - Public Present As Boolean - Public SequenceEncoding As Integer - Public TF_IDF As Double - - Public Sub New() - - End Sub - - Public Sub New(text As String, sequenceEncoding As Integer, frequency As Integer, tF_IDF As Double) - If text Is Nothing Then - Throw New ArgumentNullException(NameOf(text)) - End If - - Me.Text = text - Me.SequenceEncoding = sequenceEncoding - Me.Frequency = frequency - Me.TF_IDF = tF_IDF - End Sub - - End Class - - Public Enum VocabularyType - Character - Word - Sentence - End Enum - - Private Shared Function CreateCharVocabulary(ByRef text As String) As List(Of VocabularyEntry) - - Dim RecordList = CreateUniqueChars(text) - - Dim vocabulary As List(Of VocabularyEntry) = Encode.EncodeChars(RecordList) - Return vocabulary - End Function - - Private Shared Function CreateWordVocabulary(ByRef text As String) As List(Of VocabularyEntry) - - Dim RecordList = CreateUniqueWords(text) - - Dim vocabulary As List(Of VocabularyEntry) = Encode.EncodeWords(RecordList) - Return vocabulary - End Function - - Private Shared Function CreateSentenceVocabulary(ByRef text As String) As List(Of VocabularyEntry) - - Dim RecordList = CreateUniqueSentences(text) - - Dim vocabulary As List(Of VocabularyEntry) = Encode.EncodeWords(RecordList) - Return vocabulary - End Function - - Public Shared Function UpdateVocabulary(ByRef Text As String, ByRef vocab As List(Of VocabularyEntry)) - Return Encode.AddNewEncoding(Text, vocab) - End Function - - Public Shared Function CreateUniqueSentences(ByRef Text As String) As List(Of String) - Dim Words = Tokenizer.TokenizeBySentence(Text) - Dim WordList As New List(Of String) - For Each item In Words - If WordList.Contains(item.Value.ToLower) = False Then - WordList.Add(item.Value.ToLower) - End If - - Next - - Return WordList - End Function - - Public Shared Function CreateUniqueWords(ByRef Text As String) As List(Of String) - Dim Words = Tokenizer.TokenizeByWord(Text) - Dim WordList As New List(Of String) - For Each item In Words - If WordList.Contains(item.Value.ToLower) = False Then - WordList.Add(item.Value.ToLower) - End If - - Next - - Return WordList - End Function - - Public Shared Function CreateUniqueChars(ByRef Text As String) As List(Of String) - Dim Chars = Tokenizer.TokenizeByCharacter(Text) - Dim CharList As New List(Of String) - For Each item In Chars - If CharList.Contains(item.Value.ToLower) = False Then - CharList.Add(item.Value.ToLower) - End If - - Next - - Return CharList - End Function - - Public Shared Function CreateVocabulary(ByRef Text As String, vType As VocabularyType) As List(Of VocabularyEntry) - Select Case vType - Case VocabularyType.Character - Return CreateCharVocabulary(Text) - Case VocabularyType.Word - Return CreateWordVocabulary(Text) - Case VocabularyType.Sentence - Return CreateSentenceVocabulary(Text) - End Select - Return New List(Of VocabularyEntry) - End Function - - End Class - - ' Positional Encoding : - ' To provide positional information to the model, positional encodings. - ' These encodings are added to the input embeddings to capture the order of the tokens in the sequence. - ' Positional Encoding : - ' To provide positional information to the model, positional encodings. - ' These encodings are added to the input embeddings to capture the order of the tokens in the sequence. - Public Class PositionalEncoding - Private ReadOnly encodingMatrix As List(Of List(Of Double)) - Private InternalVocab As Corpus.Vocabulary - - Public Sub New(maxLength As Integer, embeddingSize As Integer, ByRef Vocab As Corpus.Vocabulary) - InternalVocab = Vocab - encodingMatrix = New List(Of List(Of Double))() - ' Create the encoding matrix - For pos As Integer = 0 To maxLength - 1 - Dim encodingRow As List(Of Double) = New List(Of Double)() - For i As Integer = 0 To embeddingSize - 1 - Dim angle As Double = pos / Math.Pow(10000, (2 * i) / embeddingSize) - encodingRow.Add(Math.Sin(angle)) - encodingRow.Add(Math.Cos(angle)) - Next - encodingMatrix.Add(encodingRow) - Next - End Sub - - Public Function Encode(inputTokens As List(Of String)) As List(Of List(Of Double)) - Dim encodedInputs As List(Of List(Of Double)) = New List(Of List(Of Double))() - - For Each token As String In inputTokens - Dim tokenEncoding As List(Of Double) = New List(Of Double)() - - ' Find the index of the token in the vocabulary - ' For simplicity, let's assume a fixed vocabulary - Dim tokenIndex As Integer = GetTokenIndex(token) - - ' Retrieve the positional encoding for the token - If tokenIndex >= 0 Then - tokenEncoding = encodingMatrix(tokenIndex) - Else - ' Handle unknown tokens if necessary - End If - - encodedInputs.Add(tokenEncoding) - Next - - Return encodedInputs - End Function - - Private Function GetTokenIndex(token As String) As Integer - ' Retrieve the index of the token in the vocabulary - ' For simplicity, let's assume a fixed vocabulary - Dim vocabulary As List(Of String) = GetVocabulary(InternalVocab) - Return vocabulary.IndexOf(token) - End Function - - Private Function GetVocabulary(ByRef Vocab As Corpus.Vocabulary) As List(Of String) - ' Return the vocabulary list - ' Modify this function as per your specific vocabulary - Dim Lst As New List(Of String) - For Each item In Vocab.Current - Lst.Add(item.Text) - Next - Return Lst - End Function - - End Class - - Public Class PositionalDecoder - Private ReadOnly decodingMatrix As List(Of List(Of Double)) - Private InternalVocab As Corpus.Vocabulary - - Public Sub New(maxLength As Integer, embeddingSize As Integer, ByRef Vocab As Corpus.Vocabulary) - decodingMatrix = New List(Of List(Of Double))() - InternalVocab = Vocab - ' Create the decoding matrix - For pos As Integer = 0 To maxLength - 1 - Dim decodingRow As List(Of Double) = New List(Of Double)() - - For i As Integer = 0 To embeddingSize - 1 - Dim angle As Double = pos / Math.Pow(10000, (2 * i) / embeddingSize) - decodingRow.Add(Math.Sin(angle)) - decodingRow.Add(Math.Cos(angle)) - Next - - decodingMatrix.Add(decodingRow) - Next - End Sub - - Public Function Decode(encodedInputs As List(Of List(Of Double))) As List(Of String) - Dim decodedTokens As List(Of String) = New List(Of String)() - - For Each encoding As List(Of Double) In encodedInputs - ' Retrieve the token index based on the encoding - Dim tokenIndex As Integer = GetTokenIndex(encoding) - - ' Retrieve the token based on the index - If tokenIndex >= 0 Then - Dim token As String = GetToken(tokenIndex) - decodedTokens.Add(token) - Else - ' Handle unknown encodings if necessary - End If - Next - - Return decodedTokens - End Function - - Private Function GetTokenIndex(encoding As List(Of Double)) As Integer - ' Retrieve the index of the token based on the encoding - ' For simplicity, let's assume a fixed vocabulary - Dim vocabulary As List(Of String) = GetVocabulary(InternalVocab) - - For i As Integer = 0 To decodingMatrix.Count - 1 - If encoding.SequenceEqual(decodingMatrix(i)) Then - Return i - End If - Next - - Return -1 ' Token not found - End Function - - Private Function GetToken(tokenIndex As Integer) As String - ' Retrieve the token based on the index - ' For simplicity, let's assume a fixed vocabulary - Dim vocabulary As List(Of String) = GetVocabulary(InternalVocab) - - If tokenIndex >= 0 AndAlso tokenIndex < vocabulary.Count Then - Return vocabulary(tokenIndex) - Else - Return "Unknown" ' Unknown token - End If - End Function - - Private Function GetVocabulary(ByRef Vocab As Corpus.Vocabulary) As List(Of String) - ' Return the vocabulary list - ' Modify this function as per your specific vocabulary - Dim Lst As New List(Of String) - For Each item In Vocab.Current - Lst.Add(item.Text) - Next - Return Lst - End Function - - End Class - - End Class + Public Structure WordVector Dim Freq As Integer Public NormalizedEncoding As Integer @@ -2243,8 +454,9 @@ Namespace LanguageModels End Structure Namespace BaseModels - + Public Class LanguageModelFactory + Public Class LangModelGenerator Inherits NgramLanguageModel @@ -2363,7 +575,9 @@ Namespace LanguageModels Throw New ArgumentException("Invalid n-gram size: " & n) End Select End Function + Public Class NgramModels + Public Class BaseModels Public Class BigramLanguageModel Inherits NgramLanguageModel @@ -3117,7 +1331,9 @@ Namespace LanguageModels End Class End Class + Public Class TestModels + Public Class CodeGenerator Inherits NgramGenerator @@ -3205,6 +1421,7 @@ Namespace LanguageModels End Class + Public Class AttentionNgramLanguageModel @@ -3279,6 +1496,7 @@ Namespace LanguageModels End Class End Class + Public Class NgramLanguageModel Public ngramEncodings As Dictionary(Of String, Integer) @@ -3697,8 +1915,9 @@ Namespace LanguageModels End Class - + Public Class NgramFunctions + Public Class NgramTrainer Inherits NgramLanguageModel ''' @@ -3822,6 +2041,7 @@ Namespace LanguageModels End Class + Public Class NgramGenerator Inherits NgramLanguageModel ''' @@ -3938,6 +2158,7 @@ Namespace LanguageModels End Class + Public Class NGramScorer Inherits NgramLanguageModel ''' @@ -4043,16 +2264,9 @@ Namespace LanguageModels ''' known entitys and topics in the model; ''' A known list of Entitys and Topics are required to create this model ''' This language model is ideally suited for NER / and other corpus interogations - ''' ''' - ''' - ''' Corpus Language Model - ''' Used to HoldDocuments : a corpus of documents Calculating detecting the - ''' known entitys and topics in the model; - ''' A known list of Entitys and Topics are required to create this model - ''' This language model is ideally suited for NER / and other corpus interogations ''' - ''' + Public Class Corpus ''' ''' Serializes object to json @@ -4299,6 +2513,7 @@ Namespace LanguageModels ''' ''' Represents an individual document in the corpus. It contains properties such as word count, processed text, sentences, topics, etc. ''' + Public Structure Document Public ReadOnly Property WordCount As Integer @@ -4621,6 +2836,7 @@ Namespace LanguageModels End Structure End Structure + Public Structure Entity Public Property EndIndex As Integer Public Property StartIndex As Integer @@ -4642,6 +2858,7 @@ Namespace LanguageModels ''' ''' NER Data held(known) by the corpus ''' + Public Class Recognition_Data Public Entitys As List(Of Entity) Public Topics As List(Of Topic) @@ -4652,7 +2869,7 @@ Namespace LanguageModels End Sub End Class - + Public Structure Term Public DocNumber As List(Of Integer) @@ -4676,6 +2893,7 @@ Namespace LanguageModels ''' ''' Represents a topic detected in the text. It has properties for the keyword and topic itself. ''' + Public Structure Topic Public Keyword As String Public Topic As String @@ -4812,6 +3030,7 @@ Namespace LanguageModels ''' (a record of words which can be looked up in the corpus) ''' It includes methods for adding new terms, calculating frequencies, TF-IDF, etc. ''' + Public Class Vocabulary Public Current As List(Of VocabularyEntry) @@ -4920,11 +3139,12 @@ Namespace LanguageModels ''' ''' Feature context is a way to add information with regards to the document, - ''' Addind context elements such as features. + ''' Adding context elements such as features. ''' Given a Sentiment (positive) , by marking the words in this document ''' present against the corpus vocabulary, it could be suggested that these would ''' represent that topic in this document ''' + Public Structure FeatureContext ''' @@ -4970,67 +3190,8 @@ Namespace LanguageModels End Structure - Public Shared Sub Main() - 'Create Vocabulary - Dim iCorpus As String = "the quick brown fox, jumped over the lazy dog." - Dim NewVocabulary = Vocabulary.CreateVocabulary(iCorpus, Vocabulary.VocabularyType.Word) - Console.WriteLine("vocabulary List: ") - Dim str As String = "" - For Each item In NewVocabulary - str &= "entry :" & item.Text & vbTab & "Value :" & item.Encoding & vbNewLine - - Next - Console.WriteLine(str) - 'Encode InputText - Dim InputText As String = iCorpus - - Dim InputLayer As New InputTextRecord - InputLayer.Text = iCorpus - Console.WriteLine("Input layer: ") - InputLayer.Encoding = Encode.Encode_Text(InputText, NewVocabulary, VocabularyType.Word) - Console.WriteLine("Input Text: " & "[" & InputLayer.Text & "]" & vbNewLine) - Console.WriteLine("Input Embedding: ") - str = "[" - For Each item In InputLayer.Encoding - str &= item & " " - Next - str &= "] " - Console.WriteLine(str) - Console.WriteLine(vbNewLine) - 'get inputs - InputLayer.blocksize = 4 - InputLayer.Inputblocks = InputTextRecord.GetBlocks(InputLayer.Encoding, InputLayer.blocksize) - Console.WriteLine("Input BlockSize: " & InputLayer.blocksize) - Console.WriteLine("Input Blocks ") - For Each lst In InputLayer.Inputblocks - - Dim block As String = "" - For Each item In lst - block &= item & " " - Next - Console.WriteLine("[" & block & "]") - Next - Console.WriteLine(vbNewLine) - Dim ofset = 1 - 'get targets(add ofset to get targets further in the future ofset < blocksize) - - InputLayer.Targetblocks = InputTextRecord.GetTargetBlocks(InputLayer.Encoding, InputLayer.blocksize) - - Console.WriteLine("Target BlockSize: " & InputLayer.blocksize) - Console.WriteLine("Target ofset : " & ofset) - Console.WriteLine("Target Blocks ") - For Each lst In InputLayer.Targetblocks - - Dim block As String = "" - For Each item In lst - block &= item & " " - Next - Console.WriteLine("[" & block & "]") - Next - Console.ReadLine() - - End Sub + Public Structure InputTextRecord Public Text As String Public Encoding As List(Of Integer) @@ -5206,7 +3367,7 @@ Namespace LanguageModels End Function End Class - + Public Class VocabularyEntry Public Text As String Public Encoding As Integer @@ -5335,6 +3496,7 @@ Namespace LanguageModels ''' by summing thier presence (one hot encoding) (Scoring) ''' this has (simple Bag of words) and (Complex(ORganized) Bag of Words) usage ''' + Public Class BagOfWords @@ -5978,11 +4140,12 @@ Namespace LanguageModels ''' An Encoded Language model , ''' With an internal vocabulary for Basic Encoding/Decoding ''' + Public Class iLangModel Private LanguageModel As BaseModels.LanguageModelFactory.NgramModels.NgramLanguageModel Private Attend As FeedForwardNetwork Private csize As Integer - Public Structure Vocabulary + Private Structure Vocabulary Private iValues As List(Of Token) Public ReadOnly Property Values As List(Of Token) Get @@ -6039,7 +4202,7 @@ Namespace LanguageModels End Function End Structure Private iVocabulary As Vocabulary - Public ReadOnly Property EncodingVocabulary As Vocabulary + Private ReadOnly Property EncodingVocabulary As Vocabulary Get Return iVocabulary End Get @@ -6058,9 +4221,9 @@ Namespace LanguageModels ''' ''' Can be set with a known vocabulary ''' - ''' - Public Sub New(iVocabulary As Vocabulary) - Me.iVocabulary = iVocabulary + ''' + Public Sub New(model As iLangModel) + Me.iVocabulary = model.EncodingVocabulary End Sub ''' ''' This input is encoded as a single value, @@ -6366,7 +4529,7 @@ Namespace LanguageModels End Class - Private Class FeedForwardNetwork + Public Class FeedForwardNetwork Public Enum Activation ReLU Sigmoid @@ -6559,71 +4722,7 @@ Namespace LanguageModels Return input End Function - Private Shared Function TrainTest() As FeedForwardNetwork - ' Create the input and target training data - Dim inputs As New List(Of List(Of Double))() - Dim targets As New List(Of List(Of Double))() - - ' AND logic gate training data - inputs.Add(New List(Of Double)() From {0, 0}) - inputs.Add(New List(Of Double)() From {0, 1}) - inputs.Add(New List(Of Double)() From {1, 0}) - inputs.Add(New List(Of Double)() From {1, 1}) - - targets.Add(New List(Of Double)() From {0}) - targets.Add(New List(Of Double)() From {0}) - targets.Add(New List(Of Double)() From {0}) - targets.Add(New List(Of Double)() From {1}) - - ' Create a feed-forward neural network with 2 input neurons, 2 hidden neurons, and 1 output neuron - Dim network As New FeedForwardNetwork(inputSize:=2, hiddenSize:=2, outputSize:=1) - - ' Train the network using the training data for 100 epochs with a learning rate of 0.1 - network.Train(inputs, targets, epochs:=100, learningRate:=0.1) - - ' Test the trained network - Console.WriteLine("Testing the trained network:") - - For i As Integer = 0 To inputs.Count - 1 - Dim inputVector As List(Of Double) = inputs(i) - Dim targetVector As List(Of Double) = targets(i) - - Dim outputVector = network.Forward(inputs) - - Console.WriteLine("Input: {0}, Target: {1}, Output: {2}", String.Join(", ", inputVector), String.Join(", ", targetVector), String.Join(", ", outputVector)) - Next - - Return network - End Function - Public Shared Sub Main() - ' Create an instance of the FeedForwardNetwork - Dim feedForwardNN As FeedForwardNetwork = TrainTest() - - ' Define the input sequence for the logical AND operation - Dim inputSequence As List(Of List(Of Double)) = New List(Of List(Of Double))() From - { - New List(Of Double)() From {0, 0}, - New List(Of Double)() From {0, 1}, - New List(Of Double)() From {1, 0}, - New List(Of Double)() From {1, 1} - } - - ' Apply the forward pass to get the predicted outputs - Dim output As List(Of List(Of Double)) = feedForwardNN.Forward(inputSequence) - - ' Display the input sequence and predicted outputs - Console.WriteLine("Input Sequence:") - For Each inputVector As List(Of Double) In inputSequence - Console.WriteLine(String.Join(", ", inputVector)) - Next - - Console.WriteLine("Predicted Outputs:") - For Each outputVector As List(Of Double) In output - Console.WriteLine(Math.Round(outputVector(0))) ' Round the output to the nearest integer (0 or 1) - Next - Console.ReadLine() - End Sub Public Function Forward(inputs As List(Of List(Of Double))) As List(Of List(Of Double)) Dim hiddenOutputs As List(Of List(Of Double)) = LinearTransformation(inputs, hiddenWeights, hiddenSize) @@ -7253,6 +5352,7 @@ Namespace LanguageModels Return Vocabulary End Function End Class + Public Class PositionalEncoding Private ReadOnly encodingMatrix As List(Of List(Of Double)) Private Vocabulary As New List(Of String) @@ -7325,6 +5425,7 @@ Namespace LanguageModels End Function End Class End Class + Public Class PredictiveLanguageModel Public ReadOnly Model As NgramLanguageModel Private Vocab As Corpus.Vocabulary diff --git a/SourceCode/Storage.vb b/SourceCode/Storage.vb index c46e04a..1d6b17b 100644 --- a/SourceCode/Storage.vb +++ b/SourceCode/Storage.vb @@ -4,6 +4,7 @@ Imports MathNet.Numerics Namespace Models Namespace Embeddings Namespace Storage + Public Class SearchEngine Private documents As List(Of Document) @@ -42,6 +43,7 @@ Namespace Models Return snippet End Function End Class + Public Class Document Public Property Index As Integer Public Property Content As String @@ -65,7 +67,7 @@ Namespace Models End Sub End Class Namespace MinHashAndLSH - + Public Class LSHIndex Private HashTables As List(Of Dictionary(Of Integer, List(Of Document))) Private NumHashTables As Integer @@ -184,7 +186,7 @@ Namespace Models End Function End Class - + Public Class MinHashVectorDatabase Private MinHashIndex As MinHashIndex @@ -203,6 +205,7 @@ Namespace Models Return similarDocuments End Function End Class + Public Class MinHashIndex Private NumHashFunctions As Integer Private SignatureMatrix As List(Of List(Of Integer)) @@ -289,6 +292,7 @@ Namespace Models End Function End Class End Namespace + Public Class VectorStorageModel Private AudioVectors As Dictionary(Of Integer, List(Of Complex)) = New Dictionary(Of Integer, List(Of Complex))() Private ImageVectors As Dictionary(Of Integer, Tuple(Of VectorType, List(Of Double))) = New Dictionary(Of Integer, Tuple(Of VectorType, List(Of Double)))() diff --git a/SourceCode/Tokenizers.vb b/SourceCode/Tokenizers.vb new file mode 100644 index 0000000..813e9e5 --- /dev/null +++ b/SourceCode/Tokenizers.vb @@ -0,0 +1,1378 @@ + + +Imports System.Runtime.CompilerServices +Imports System.Runtime.Serialization.Formatters.Binary +Imports System.Windows.Forms + +Namespace Models + Namespace TokenizerModels + + Public Class Token + + ''' + ''' Initializes a new instance of the Token structure. + ''' + ''' The type of the token. + ''' The string value of the token. + Public Sub New(ByVal type As String, ByVal value As String) + Me.Type = type + Me.Value = value + End Sub + + Public Sub New(ByVal type As TokenType, ByVal value As String, ByVal startPosition As Integer, ByVal endPosition As Integer) + Me.Type = type + Me.Value = value + Me.StartPosition = startPosition + Me.EndPosition = endPosition + End Sub + + Public Property EndPosition As Integer + Public Property StartPosition As Integer + Public Property Type As TokenType + Public Property Value As String + + Private iStopWords As List(Of String) + + Private Function RemoveStopWords(ByVal tokens As List(Of Token)) As List(Of Token) + Return tokens.Where(Function(token) Not StopWords.Contains(token.Value)).ToList() + End Function + Public Property StopWordRemovalEnabled As Boolean + + Public Property StopWords As List(Of String) + Get + Return iStopWords + End Get + Set(value As List(Of String)) + iStopWords = value + End Set + End Property + End Class + Public Module TokenizerExtensions + + Public Function ModelImporter(ByRef Filename As String) As Object + Dim FileStream As New System.IO.FileStream(Filename, System.IO.FileMode.Open) + Dim Formatter As New BinaryFormatter + Dim Model As Object = Formatter.Deserialize(FileStream) + FileStream.Close() + + Return Model + End Function + + Public Sub ModelExporter(ByRef Model As Object, Filename As String) + Dim path As String = Application.StartupPath + + Dim FileStream As New System.IO.FileStream(Filename, System.IO.FileMode.CreateNew) + Dim Formatter As New BinaryFormatter + Formatter.Serialize(Model, FileStream) + FileStream.Close() + + + End Sub + + + Public Enum TokenizerType + _Char + _Word + _Sentence + _Paragraph + _BPE + _Wordpiece + _Token + _TokenID + End Enum + Public Enum TokenType + GramaticalPunctuation + EncapuslationPunctuationStart + EncapuslationPunctuationEnd + MoneyPunctuation + MathPunctuation + CodePunctuation + AlphaBet + Number + Symbol + SeperatorPunctuation + Ignore + Word + Sentence + Character + Ngram + WordGram + SentenceGram + BitWord + Punctuation + whitespace + End Enum + + + Public Class PunctuationMarkers + Public Shared ReadOnly SeperatorPunctuation() As String = {" ", ",", "|"} + Public Shared ReadOnly Symbols() As String = {"@", "#", "$", "%", "&", "*", "+", "=", "^", "_", "~", "§", "°", "¿", "¡"} + Public Shared ReadOnly EncapuslationPunctuationEnd() As String = {"}", "]", ">", ")"} + Public Shared ReadOnly EncapuslationPunctuationStart() As String = {"{", "[", "<", "("} + Public Shared ReadOnly GramaticalPunctuation() As String = {".", "?", "!", ":", ";", ","} + Public Shared ReadOnly MathPunctuation = New String() {"+", "-", "*", "/", "=", "<", ">", "≤", "≥", "±", "≈", "≠", "%", "‰", "‱", "^", "_", "√", "∛", "∜", "∫", "∬", "∭", "∮", "∯", "∰", "∇", "∂", "∆", "∏", "∑", "∐", "⨀", "⨁", "⨂", "⨃", "⨄", "∫", "∬", "∭", "∮", "∯", "∰", "∇", "∂", "∆", "∏", "∑", "∐", "⨀", "⨁", "⨂", "⨃", "⨄"} + Public Shared ReadOnly MoneyPunctuation() As String = {"$", "€", "£", "¥", "₹", "₽", "₿"} + Public Shared ReadOnly CodePunctuation() As String = {"\", "#", "@", "^"} + + Public Shared ReadOnly Delimiters() As Char = {CType(" ", Char), CType(".", Char), + CType(",", Char), CType("?", Char), + CType("!", Char), CType(";", Char), + CType(":", Char), Chr(10), Chr(13), vbTab} + + Public ReadOnly Property SentenceEndPunctuation As List(Of String) + Get + Dim markers() As String = {".", ";", ":", "!", "?"} + Return markers.ToList + End Get + End Property + + Public Shared ReadOnly Property Punctuation As List(Of String) + Get + Dim x As New List(Of String) + x.AddRange(SeperatorPunctuation) + x.AddRange(Symbols) + x.AddRange(EncapuslationPunctuationStart) + x.AddRange(EncapuslationPunctuationEnd) + x.AddRange(MoneyPunctuation) + x.AddRange(MathPunctuation) + x.AddRange(GramaticalPunctuation) + x.AddRange(CodePunctuation) + Return x.Distinct.ToList + End Get + End Property + + End Class + + Public Function SpaceItems(ByRef txt As String, Item As String) As String + Return txt.Replace(Item, " " & Item & " ") + End Function + Public Class VocabularyPruner + Public Sub New(maxVocab As Integer, vocabulary As Dictionary(Of String, Integer), lowestVocabularyFreq As Integer) + If vocabulary Is Nothing Then + Throw New ArgumentNullException(NameOf(vocabulary)) + End If + + Me.MaxVocab = maxVocab + Me.Vocabulary = vocabulary + Me.LowestVocabularyFreq = lowestVocabularyFreq + End Sub + + ''' + ''' Defines max entries in vocabulary before Pruning Rare Words + ''' + ''' + Public Property MaxVocab As Integer = 100000 + Public Property Vocabulary As New Dictionary(Of String, Integer) + Public Property LowestVocabularyFreq As Integer = 1 + Public Function Prune() As Dictionary(Of String, Integer) + + + If Vocabulary.Count > MaxVocab Then + PruneVocabulary() + End If + Return Vocabulary + End Function + + Private Sub PruneVocabulary() + ' Create a list to store tokens to be removed. + Dim tokensToRemove As New List(Of String) + + ' Iterate through the vocabulary and identify tokens to prune. + For Each token In Vocabulary + Dim tokenId As Integer = token.Value + Dim tokenFrequency As Integer = Vocabulary(token.Key) + + ' Prune the token if it has frequency below the threshold (1) and is not recent (has a lower ID). + If tokenFrequency <= LowestVocabularyFreq AndAlso tokenId < Vocabulary.Count - 1 Then + tokensToRemove.Add(token.Key) + End If + Next + + ' Remove the identified tokens from the vocabulary. + For Each tokenToRemove In tokensToRemove + Vocabulary.Remove(tokenToRemove) + Next + + Console.WriteLine("Pruning completed. Vocabulary size after pruning: " & Vocabulary.Count) + Console.ReadLine() + End Sub + End Class + + Public Function UpdateVocabulary(vocabulary As Dictionary(Of String, Integer), Term As String) As Dictionary(Of String, Integer) + If vocabulary(Term) > 0 Then + Dim Freq As Integer = vocabulary(Term) + Freq += 1 + vocabulary.Remove(Term) + vocabulary.Add(Term, Freq) + Else + vocabulary.Add(Term, 1) + End If + Return vocabulary + End Function + + Public Function GetHighFreqLst(ByRef Vocabulary As Dictionary(Of String, Integer), ByRef Threshold As Integer) As List(Of String) + Dim HighFreq As New List(Of String) + For Each item In Vocabulary + If item.Value > Threshold Then + HighFreq.Add(item.Key) + End If + Next + Return HighFreq + End Function + Public Function FindFrequentCharacterBigrams(Vocab As List(Of String), ByRef Freq_Threshold As Integer) As List(Of String) + Dim bigramCounts As New Dictionary(Of String, Integer) + + For Each word In Vocab + Dim characters As Char() = word.ToCharArray() + + For i As Integer = 0 To characters.Length - 2 + Dim bigram As String = characters(i) & characters(i + 1) + + If bigramCounts.ContainsKey(bigram) Then + bigramCounts(bigram) += 1 + Else + bigramCounts.Add(bigram, 1) + End If + Next + Next + + Dim frequentCharacterBigrams As New List(Of String) + + For Each pair In bigramCounts + If pair.Value > Freq_Threshold Then ' Adjust the threshold as needed + frequentCharacterBigrams.Add(pair.Key) + End If + Next + + Return frequentCharacterBigrams + End Function + + Public Function UpdateCorpusWithMergedToken(ByRef corpus As List(Of String), pair As String) As List(Of String) + ' Update the text corpus with the merged token for the next iteration. + Return corpus.ConvertAll(Function(text) text.Replace(pair, pair.Replace(" ", "_"))) + End Function + + + + + Public Function SpacePunctuation(ByRef Txt As String) As String + For Each item In PunctuationMarkers.Punctuation + Txt = SpaceItems(Txt, item) + Next + + Return Txt + End Function + + + + Public Function ExtractStringBetween(ByVal value As String, ByVal strStart As String, ByVal strEnd As String) As String + If Not String.IsNullOrEmpty(value) Then + Dim i As Integer = value.IndexOf(strStart) + Dim j As Integer = value.IndexOf(strEnd) + Return value.Substring(i, j - i) + Else + Return value + End If + End Function + End Module + + Public MustInherit Class Tokenizer + + + Public Shared Function TokenizeToCharacter(Document As String) As List(Of String) + Document = Document.ToLower() + Dim characters As Char() = Document.ToCharArray() + TokenizeToCharacter = New List(Of String) + For Each item In characters + TokenizeToCharacter.Add(item) + Next + End Function + + Public Shared Function TokenizeToWord(Document As String) As List(Of String) + Document = Document.ToLower() + Document = Document.SpacePunctuation + Return Document.Split({" ", ".", ",", ";", ":", "!", "?"}, StringSplitOptions.RemoveEmptyEntries).ToList + End Function + + Public Shared Function TokenizeToSentence(Document As String) As List(Of String) + Document = Document.ToLower() + Document = Document.SpacePunctuation + Return Split(Document, ".").ToList + End Function + + Public Shared Function TokenizeToParagraph(Document As String) As List(Of String) + Document = Document.ToLower() + + Return Split(Document, vbNewLine).ToList + End Function + Public Shared Function CharGram(Document As String, n As Integer) As List(Of String) + CharGram = New List(Of String) + Document = Document.ToLower() + Document = Document.SpacePunctuation + + ' Generate character n-grams + For i As Integer = 0 To Document.Length - n + Dim ngram As String = Document.Substring(i, n) + CharGram.Add(ngram) + Next + + End Function + + Public Shared Function WordGram(ByRef text As String, n As Integer) As List(Of String) + WordGram = New List(Of String) + text = text.ToLower() + text = text.SpacePunctuation + + ' Split the clean text into individual words + Dim words() As String = text.Split({" ", ".", ",", ";", ":", "!", "?"}, StringSplitOptions.RemoveEmptyEntries) + + ' Generate n-grams from the words + For i As Integer = 0 To words.Length - n + Dim ngram As String = String.Join(" ", words.Skip(i).Take(n)) + WordGram.Add(ngram) + Next + + End Function + + Public Shared Function ParagraphGram(text As String, n As Integer) As List(Of String) + ParagraphGram = New List(Of String) + + ' Split the text into paragraphs + Dim paragraphs() As String = text.Split({Environment.NewLine & Environment.NewLine}, StringSplitOptions.RemoveEmptyEntries) + + ' Generate paragraph n-grams + For i As Integer = 0 To paragraphs.Length - n + Dim ngram As String = String.Join(Environment.NewLine & Environment.NewLine, paragraphs.Skip(i).Take(n)) + ParagraphGram.Add(ngram) + Next + + Return ParagraphGram + End Function + + Public Shared Function SentenceGram(text As String, n As Integer) As List(Of String) + Dim tokens As New List(Of String) + + ' Split the text into Clauses + Dim Clauses() As String = text.Split({".", ",", ";", ":", "!", "?"}, StringSplitOptions.RemoveEmptyEntries) + + ' Generate sentence n-grams + For i As Integer = 0 To Clauses.Length - n + Dim ngram As String = String.Join(" ", Clauses.Skip(i).Take(n)) + tokens.Add(ngram) + Next + + Return tokens + End Function + + + Public MustOverride Sub Train(ByRef Corpus As List(Of String)) + + Public Overridable Function Tokenize(text As String) As List(Of String) + Return TokenizeToWord(text) + End Function + + End Class + + Public Class WordPiece + Inherits Tokenizer + + Property vocabulary As Dictionary(Of String, Integer) + Public Property maxVocabSize As Integer + Public ReadOnly Property maxSubwordLength As Integer + + Public Sub New() + + Me.vocabulary = New Dictionary(Of String, Integer) + Me.maxVocabSize = 1000000 + Me.maxSubwordLength = 20 + End Sub + + Public Overrides Sub Train(ByRef corpus As List(Of String)) + Dim subwordCounts As New Dictionary(Of String, Integer) + + ' Count subword occurrences in the corpus + For Each sentence As String In corpus + Dim tokens As List(Of String) = Tokenize(sentence) + + For Each token As String In tokens + If subwordCounts.ContainsKey(token) Then + subwordCounts(token) += 1 + Else + subwordCounts.Add(token, 1) + End If + Next + Next + + ' Sort subwords by frequency and add them to the vocabulary + Dim sortedSubwords = subwordCounts.OrderByDescending(Function(pair) pair.Value) + + For Each pair In sortedSubwords.Take(maxVocabSize) + vocabulary.Add(pair.Key, vocabulary.Count) + Next + End Sub + Public Overrides Function Tokenize(text As String) As List(Of String) + Dim tokens As New List(Of String) + Dim index As Integer = 0 + + While index < text.Length + Dim subwordLength As Integer = Math.Min(maxSubwordLength, text.Length - index) + Dim subword As String = text.Substring(index, subwordLength) + + While subwordLength > 0 AndAlso Not vocabulary.ContainsKey(subword) + subwordLength -= 1 + subword = text.Substring(index, subwordLength) + End While + + tokens.Add(subword) + index += subwordLength + End While + + Return tokens + End Function + Public Shared Function CalculateWordPieceFrequency(ByVal subword As String, ByVal mergedWord As String) As Integer + Dim occurrences As Integer = 0 + Dim index As Integer = -1 + + While True + index = mergedWord.IndexOf(subword, index + 1) + If index = -1 Then + Exit While + End If + + ' Check if the found index is part of a valid word (not a subword of another word) + If index = 0 OrElse mergedWord(index - 1) = " "c Then + Dim endIndex As Integer = index + subword.Length + If endIndex = mergedWord.Length OrElse mergedWord(endIndex) = " "c Then + occurrences += 1 + End If + End If + End While + + Return occurrences + End Function + End Class + + Public Class BPE + Inherits Tokenizer + + Public Vocabulary As New Dictionary(Of String, Integer) + Public numMerges As Integer = 1 + Public Sub New() + End Sub + + Public Overrides Sub Train(ByRef Corpus As List(Of String)) + For Each item In Corpus + ' Tokenize the corpus at the character level to get the initial vocabulary + Dim characterLevelVocabulary As Dictionary(Of String, Integer) = TrainTokenize(item) + + ' Merge the most frequent pairs of subwords iteratively + For i As Integer = 0 To numMerges - 1 + Dim mostFrequentPair As SubWord_Pair = FindMostFrequentPair(characterLevelVocabulary) + If mostFrequentPair Is Nothing Then + Exit For + End If + + Dim newSubword As String = mostFrequentPair.Sub_word_1 + mostFrequentPair.Sub_Word_2 + characterLevelVocabulary = MergeSubwordPair(characterLevelVocabulary, mostFrequentPair, newSubword) + Next + For Each Entry In characterLevelVocabulary + + UpdateVocabulary(Vocabulary, Entry.Key) + Next + + + Next + + + + + End Sub + + Private Function TrainTokenize(Document As String) As Dictionary(Of String, Integer) + Dim characterLevelVocabulary As New Dictionary(Of String, Integer) + + + For Each character As Char In Document + Dim subword As String = character.ToString() + + If characterLevelVocabulary.ContainsKey(subword) Then + characterLevelVocabulary(subword) += 1 + Else + characterLevelVocabulary.Add(subword, 1) + End If + Next + + + Return characterLevelVocabulary + End Function + + Private Shared Function getTokenlist(characterLevelVocabulary As Dictionary(Of String, Integer)) As List(Of String) + Dim Tokens As New List(Of String) + For Each item In characterLevelVocabulary + Tokens.Add(item.Key) + Next + Return Tokens + End Function + + Private Shared Function FindMostFrequentPair(vocabulary As Dictionary(Of String, Integer)) As SubWord_Pair + Dim mostFrequentPair As SubWord_Pair = Nothing + Dim maxFrequency As Integer = 0 + + For Each subword1 As String In vocabulary.Keys + For Each subword2 As String In vocabulary.Keys + If subword1 <> subword2 Then + Dim pairFrequency As Integer = CalculatePairFrequency(vocabulary, subword1, subword2) + If pairFrequency > maxFrequency Then + maxFrequency = pairFrequency + mostFrequentPair = New SubWord_Pair(subword1, subword2, pairFrequency) + End If + End If + Next + Next + + Return mostFrequentPair + End Function + + Private Shared Function CalculatePairFrequency(vocabulary As Dictionary(Of String, Integer), subword1 As String, subword2 As String) As Integer + Dim pairFrequency As Integer = 0 + + For Each word As String In vocabulary.Keys + Dim mergedWord As String = word.Replace(subword1 + subword2, subword1 + subword2.ToLower()) + Dim occurrences As Integer = 0 + Dim index As Integer = -1 + + While True + index = mergedWord.IndexOf(subword1 + subword2.ToLower(), index + 1) + If index = -1 Then + Exit While + End If + occurrences += 1 + End While + + + pairFrequency += occurrences * vocabulary(word) + Next + + Return pairFrequency + End Function + + Private Shared Function MergeSubwordPair(vocabulary As Dictionary(Of String, Integer), pairToMerge As SubWord_Pair, newSubword As String) As Dictionary(Of String, Integer) + Dim newVocabulary As New Dictionary(Of String, Integer) + + For Each subword As String In vocabulary.Keys + Dim mergedSubword As String = subword.Replace(pairToMerge.Sub_word_1 + pairToMerge.Sub_Word_2, newSubword) + newVocabulary(mergedSubword) = vocabulary(subword) + Next + + Return newVocabulary + End Function + + Public Overrides Function Tokenize(Document As String) As List(Of String) + Dim characterLevelVocabulary As New Dictionary(Of String, Integer) + + + For Each character As Char In Document + Dim subword As String = character.ToString() + + If characterLevelVocabulary.ContainsKey(subword) Then + characterLevelVocabulary(subword) += 1 + Else + characterLevelVocabulary.Add(subword, 1) + End If + Next + + + Return getTokenlist(characterLevelVocabulary) + End Function + + Public Class SubWord_Pair + Public Property Sub_word_1 As String + Public Property Sub_Word_2 As String + Public Property Frequency As Integer + + Public Sub New(Sub_word_1 As String, Sub_Word_2 As String, frequency As Integer) + Me.Sub_word_1 = Sub_word_1 + Me.Sub_Word_2 = Sub_Word_2 + Me.Frequency = frequency + End Sub + End Class + End Class + + Public Class BitWord + Inherits Tokenizer + Public Property Vocabulary As Dictionary(Of String, Integer) + Public Property MaxMergeOperations As Integer + + Public Overrides Sub Train(ByRef Corpus As List(Of String)) + ' Initialize the vocabulary with word-level subword units + TokenizeCorpus(Corpus) + Dim mergeOperationsCount As Integer = 0 + + While mergeOperationsCount < MaxMergeOperations + ' Compute the frequency of subword units in the vocabulary + Dim subwordFrequencies As New Dictionary(Of String, Integer) + + For Each subword In Vocabulary.Keys + Dim subwordUnits = TokenizeToCharacter(subword) + For Each unit In subwordUnits + If subwordFrequencies.ContainsKey(unit) Then + subwordFrequencies(unit) += Vocabulary(subword) + Else + subwordFrequencies.Add(unit, Vocabulary(subword)) + End If + Next + Next + + ' Find the most frequent pair of subword units + Dim mostFrequentPair As KeyValuePair(Of String, Integer) = subwordFrequencies.OrderByDescending(Function(pair) pair.Value).FirstOrDefault() + + If mostFrequentPair.Value < 2 Then + ' Stop merging if the frequency of the most frequent pair is less than 2 + Exit While + End If + + ' Merge the most frequent pair into a new subword unit + Dim newSubwordUnit = mostFrequentPair.Key + + ' Update the vocabulary by replacing occurrences of the merged subword pair with the new subword unit + Dim updatedVocabulary As New Dictionary(Of String, Integer) + + For Each subword In Vocabulary.Keys + Dim mergedSubword = subword.Replace(mostFrequentPair.Key, newSubwordUnit) + updatedVocabulary(mergedSubword) = Vocabulary(subword) + Next + + Vocabulary = updatedVocabulary + mergeOperationsCount += 1 + + End While + End Sub + + Public Function TokenizeCorpus(Corpus As List(Of String)) As List(Of String) + Dim tokens As New List(Of String) + Dim Subword As String = "" + + Dim UnknownDocs As New List(Of String) + 'SubDoc Vocabulary Tokenizer + For Each doc In Corpus + For i = 0 To doc.Count - 1 + Subword &= doc(i) + If Vocabulary.ContainsKey(Subword.ToLower()) Then + tokens.Add(Subword) + Subword = "" + End If + + Next + 'Save unknowns + If Subword <> "" Then + UnknownDocs.Add(Subword) + End If + Next + 'Unknown paragraphs + Dim UnknownParagraphs As New List(Of String) + If UnknownDocs.Count > 0 Then + For Each doc In UnknownDocs + Dim Para As List(Of String) = TokenizeToParagraph(doc) + For Each item In Para + Subword = "" + + Subword += item + If Vocabulary.ContainsKey(Subword.ToLower) Then + ' If the subword is in the Vocabulary, add it to the list of subwords + tokens.Add(Subword.ToLower) + ' Reset the subword for the next iteration + Subword = "" + End If + 'Save unknowns + If Subword <> "" Then + UnknownParagraphs.Add(Subword) + End If + Next + + Next + End If + 'Unknown Sentences + Dim UnknownSents As New List(Of String) + If UnknownParagraphs.Count > 0 Then + For Each sent In UnknownParagraphs + Dim Sents As List(Of String) = TokenizeToSentence(sent) + + + For Each item In Sents + Subword = "" + + Subword += item + If Vocabulary.ContainsKey(Subword.ToLower) Then + ' If the subword is in the Vocabulary, add it to the list of subwords + tokens.Add(Subword.ToLower) + ' Reset the subword for the next iteration + Subword = "" + End If + 'Save unknowns + If Subword <> "" Then + UnknownSents.Add(Subword) + End If + Next + Next + End If + 'Unknown Words + Dim UnknownWords As New List(Of String) + If UnknownSents.Count > 0 Then + For Each Word In UnknownSents + Dim Words As List(Of String) = TokenizeToWord(Word) + For Each item In Words + Subword = "" + + Subword += item + If Vocabulary.ContainsKey(Subword.ToLower) Then + ' If the subword is in the Vocabulary, add it to the list of subwords + tokens.Add(Subword.ToLower) + ' Reset the subword for the next iteration + Subword = "" + End If + 'Save unknowns + If Subword <> "" Then + UnknownWords.Add(Subword) + End If + Next + + Next + + End If + 'Unknown Words + Dim UnknownChars As New List(Of String) + If UnknownWords.Count > 0 Then + For Each iChar In UnknownWords + Dim Chars As List(Of String) = TokenizeToCharacter(iChar) + For Each item In Chars + Subword = "" + + Subword += item + If Vocabulary.ContainsKey(Subword.ToLower) Then + ' If the subword is in the Vocabulary, add it to the list of subwords + tokens.Add(Subword.ToLower) + ' Reset the subword for the next iteration + Subword = "" + End If + 'Save unknowns + If Subword <> "" Then + UnknownChars.Add(Subword) + End If + Next + + Next + + End If + + For Each unkChar In UnknownChars + Vocabulary.Add(unkChar, 1) + Next + + Console.WriteLine("Recognized Tokens") + For Each tok In tokens + Console.WriteLine("Token =" & tok) + Next + + Console.WriteLine("UnRecognized Tokens") + For Each tok In UnknownChars + Console.WriteLine("Token =" & tok) + Next + Return tokens + End Function + + Public Overrides Function Tokenize(text As String) As List(Of String) + Throw New NotImplementedException() + End Function + End Class + + Public Class TokenID + Inherits TokenizerToTokens + Private nextId As Integer = 0 + Public Property Vocabulary As New Dictionary(Of String, Integer) + Public TokenToID As New Dictionary(Of String, Integer) + Private IDToToken As New Dictionary(Of Integer, String) + ''' + ''' Pure Tokenizer (will tokenize based on the Tokenizer model settings) + ''' + ''' + ''' + Public Shadows Function Encode(Doc As String) As List(Of Integer) + Dim tokens = TokenizeByWord(Doc) + Dim tokenIds As New List(Of Integer) + + For Each itoken In tokens + Dim tokenId As Integer + If TokenToID.ContainsKey(itoken.Value) Then + tokenId = TokenToID(itoken.Value) + Else + 'Not registered + + tokenId = TokenToID(itoken.Value) + + End If + tokenIds.Add(tokenId) + + Next + + Return tokenIds + End Function + Public Sub UpdateVocabulary(Token As String) + + If Not Vocabulary.ContainsKey(Token) Then + Vocabulary(Token) = nextId + nextId += 1 + TokenToID = Vocabulary.ToDictionary(Function(x) x.Key, Function(x) x.Value) + IDToToken = TokenToID.ToDictionary(Function(x) x.Value, Function(x) x.Key) + End If + + + End Sub + + ''' + ''' Given a Set of Token ID Decode the Tokens + ''' + ''' + ''' + Public Function Decode(tokenIds As List(Of Integer)) As String + Dim tokens As New List(Of String) + + For Each tokenId As Integer In tokenIds + tokens.Add(IDToToken(tokenId)) + Next + + Return String.Join(" ", tokens) + End Function + + + Public Overloads Function Tokenize(text As String) As List(Of String) + Dim lst As New List(Of String) + For Each item In Encode(text) + lst.Add(item) + Next + Return lst + End Function + End Class + + Public Class Advanced + Inherits Tokenizer + + Public Property Vocabulary As Dictionary(Of String, Integer) + Public ReadOnly Property PairFrequencies As Dictionary(Of String, Integer) = ComputePairFrequencies() + Public ReadOnly Property maxSubwordLen As Integer = Me.Vocabulary.Max(Function(token) token.Key.Length) + Private ReadOnly unkToken As String = "" + ''' + ''' Defines max entries in vocabulary before Pruning Rare Words + ''' + ''' + Public Property MaxVocabSize As Integer = 100000 + Public Sub Prune(pruningThreshold As Integer) + Dim Pruner As New VocabularyPruner(MaxVocabSize, Vocabulary, pruningThreshold) + + If Vocabulary.Count > MaxVocabSize Then + Vocabulary = Pruner.Prune() + End If + + End Sub + + + Public Overrides Sub Train(ByRef Corpus As List(Of String)) + For Each item In Corpus + Train(item, 10) + Next + End Sub + Public Overloads Sub Train(text As String, isWordPiece As Boolean, Epochs As Integer) + If isWordPiece Then + TrainWordPiece(text, Epochs) + Else + TrainBPE(text, Epochs) + End If + Prune(1) + End Sub + Public Overloads Sub Train(text As String, Epochs As Integer) + ' Tokenize the text into individual characters + + Dim Bits As List(Of String) = TokenizeBitWord(text) + For Each bit As String In Bits + UpdateVocabulary(bit) + Next + + + ' Train BPE using merging strategy + Dim numMerges As Integer = Epochs ' Define the number of merges, you can adjust it as needed + For mergeIndex As Integer = 0 To numMerges - 1 + MergeMostFrequentBigram() + MergeMostFrequentPair(FindMostFrequentPair.Key) + Next + + Prune(1) + End Sub + Private Sub TrainWordPiece(text As String, Epochs As Integer) + ' Tokenize the text into individual characters + Dim Bits As List(Of String) = TokenizeWordPiece(text) + For Each bit As String In Bits + UpdateVocabulary(bit) + Next + + ' Train WordPiece using merging strategy + Dim numMerges As Integer = Epochs ' Define the number of merges, you can adjust it as needed + For mergeIndex As Integer = 0 To numMerges - 1 + MergeMostFrequentBigram() + MergeMostFrequentPair(FindMostFrequentPair.Key) + Next + End Sub + Private Sub TrainBPE(text As String, Epochs As Integer) + ' Tokenize the text into individual characters + Dim Bits As List(Of String) = TokenizeBPE(text) + For Each bit As String In Bits + UpdateVocabulary(bit) + Next + + ' Train BPE using merging strategy + Dim numMerges As Integer = Epochs ' Define the number of merges, you can adjust it as needed + For mergeIndex As Integer = 0 To numMerges - 1 + MergeMostFrequentBigram() + MergeMostFrequentPair(FindMostFrequentPair.Key) + Next + End Sub + + Private Function TrainAndTokenize(singleDocument As String, isWordPiece As Boolean, Epochs As Integer) As List(Of String) + ' Tokenize the document using the current vocabulary. + Dim tokens As List(Of String) = If(isWordPiece, Tokenize(singleDocument, True), Tokenize(singleDocument, False)) + + ' Train the tokenizer using the same document. + If isWordPiece Then + TrainWordPiece(singleDocument, Epochs) + Else + TrainBPE(singleDocument, Epochs) + End If + + ' Re-tokenize the document with the updated vocabulary. + Return If(isWordPiece, TokenizeWordPiece(singleDocument), TokenizeBPE(singleDocument)) + End Function + + Public Overloads Function Tokenize(singleDocument As String, isWordPiece As Boolean) As List(Of String) + ' Tokenize the document using the current vocabulary. + Dim tokens As List(Of String) = If(isWordPiece, Tokenize(singleDocument, True), Tokenize(singleDocument, False)) + If tokens.Contains(unkToken) = True Then + tokens = TrainAndTokenize(singleDocument, isWordPiece, 1) + End If + Return tokens + End Function + Private Function TokenizeWordPiece(text As String) As List(Of String) + Dim tokens As New List(Of String) + Dim pos As Integer = 0 + + While pos < text.Length + Dim foundSubword As Boolean = False + Dim subword As String = "" + + For subwordLen As Integer = Math.Min(Me.maxSubwordLen, text.Length - pos) To 1 Step -1 + subword = text.Substring(pos, subwordLen) + + If Vocabulary.Keys.Contains(subword) Then + tokens.Add(subword) + pos += subwordLen + foundSubword = True + Exit For + End If + Next + + ' If no subword from the vocabulary matches, split into WordPiece tokens + If Not foundSubword Then + Dim wordPieceTokens As List(Of String) = TokenizeBitWord(subword) + tokens.AddRange(wordPieceTokens) + UpdateVocabulary(subword) + pos += subword.Length + End If + End While + + Return tokens + End Function + Private Function TokenizeBPE(ByVal text As String) As List(Of String) + Dim tokens As New List(Of String) + + While text.Length > 0 + Dim foundToken As Boolean = False + + ' Find the longest token in the vocabulary that matches the start of the text + For Each subword In Vocabulary.OrderByDescending(Function(x) x.Key.Length) + If text.StartsWith(subword.Key) Then + tokens.Add(subword.Key) + text = text.Substring(subword.Key.Length) + foundToken = True + Exit For + End If + Next + + ' If no token from the vocabulary matches, break the text into subwords + If Not foundToken Then + Dim subwordFound As Boolean = False + Dim subword As String = "" + ' Divide the text into subwords starting from the longest possible length + For length = Math.Min(text.Length, 20) To 1 Step -1 + subword = text.Substring(0, length) + + ' Check if the subword is in the vocabulary + If Vocabulary.Keys(subword) Then + tokens.Add(subword) + text = text.Substring(length) + subwordFound = True + Exit For + End If + Next + + ' If no subword from the vocabulary matches, + 'Learn On the fly, But + If Not subwordFound Then + ' Throw New Exception("Unrecognized subword in the text.") + tokens.AddRange(TokenizeBitWord(unkToken & subword)) + UpdateVocabulary(subword) + + End If + End If + End While + + Return tokens + End Function + + Private Function TokenizeBitWord(subword As String) As List(Of String) + Dim wordPieceTokens As New List(Of String) + Dim startIdx As Integer = 0 + + While startIdx < subword.Length + Dim endIdx As Integer = subword.Length + Dim foundSubword As Boolean = False + + While startIdx < endIdx + Dim candidate As String = subword.Substring(startIdx, endIdx - startIdx) + Dim isLast = endIdx = subword.Length + + If Vocabulary.Keys.Contains(candidate) OrElse isLast Then + wordPieceTokens.Add(candidate) + startIdx = endIdx + foundSubword = True + Exit While + End If + + endIdx -= 1 + End While + + ' If no subword from the vocabulary matches, break the subword into smaller parts + If Not foundSubword Then + wordPieceTokens.Add("") + startIdx += 1 + End If + End While + + Return wordPieceTokens + End Function + + Public Overrides Function Tokenize(text As String) As List(Of String) + Dim Words = Tokenizer.TokenizeToWord(text) + Dim Tokens As New List(Of String) + For Each item In Words + Tokens.AddRange(TokenizeBitWord(item)) + Next + Return Tokens + End Function + ''' + ''' Adds a VocabularyList to this vocabulary + ''' + ''' + Public Sub Add_Vocabulary(initialVocabulary As List(Of String)) + + For Each word In initialVocabulary + + UpdateVocabulary(word) + + Next + + End Sub + Public Sub Initialize_Vocabulary(initialVocabulary As List(Of String), n As Integer) + + For Each word In initialVocabulary + For i As Integer = 0 To word.Length - n + UpdateVocabulary(word.Substring(i, n)) + Next + Next + + End Sub + Private Function ComputePairFrequencies() As Dictionary(Of String, Integer) + Dim pairFrequencies As Dictionary(Of String, Integer) = New Dictionary(Of String, Integer) + + For Each token As String In Vocabulary.Keys + Dim tokenChars As List(Of Char) = token.ToList() + + For i As Integer = 0 To tokenChars.Count - 2 + Dim pair As String = tokenChars(i) & tokenChars(i + 1) + + If Not pairFrequencies.ContainsKey(pair) Then + pairFrequencies.Add(pair, Vocabulary(token)) + Else + Dim value = pairFrequencies(pair) + value += Vocabulary(token) + pairFrequencies.Remove(pair) + pairFrequencies.Add(pair, value) + + + End If + Next + Next + + Return pairFrequencies + End Function + + Private Sub UpdateFrequencyDictionary(mergedSubword As String) + PairFrequencies.Remove("") + For i As Integer = 0 To mergedSubword.Length - 2 + Dim bigram As String = mergedSubword.Substring(i, 2) + If PairFrequencies.ContainsKey(bigram) Then + PairFrequencies(bigram) += 1 + Else + PairFrequencies.Add(bigram, 1) + End If + Next + End Sub + Public Sub UpdateVocabulary(ByRef Term As String) + If Vocabulary.Keys.Contains(Term) = True Then + Dim value = Vocabulary(Term) + value += 1 + Vocabulary.Remove(Term) + Vocabulary.Add(Term, value) + Else + Vocabulary.Add(Term, 1) + End If + + End Sub + Public Function GetVocabularyLst() As List(Of String) + Return Vocabulary.Keys.ToList() + End Function + + + + Private Function FindMostFrequentPair() As KeyValuePair(Of String, Integer) + ' Find the most frequent character pair from the frequency counts. + Return PairFrequencies.Aggregate(Function(x, y) If(x.Value > y.Value, x, y)) + End Function + Private Sub MergeMostFrequentPair(pair As String) + ' Merge the most frequent character pair into a new subword unit. + Dim mergedToken As String = pair.Replace(" ", "_") + UpdateVocabulary(mergedToken) + + End Sub + Private Sub MergeMostFrequentBigram() + Dim mostFrequentBigram As String = GetMostFrequentBigram() + If mostFrequentBigram IsNot Nothing Then + Dim mergedSubword As String = mostFrequentBigram.Replace("", " ") + + UpdateVocabulary(mergedSubword) + + End If + End Sub + Private Function GetMostFrequentBigram() As String + Dim mostFrequentBigram As String = Nothing + Dim maxFrequency As Integer = 0 + + For Each bigram In PairFrequencies.Keys + If PairFrequencies(bigram) > maxFrequency Then + mostFrequentBigram = bigram + maxFrequency = PairFrequencies(bigram) + End If + Next + + Return mostFrequentBigram + End Function + + + End Class + + Public Class TokenizerToTokens + + + + + + ''' + ''' Pure basic Tokenizer to Tokens + ''' + ''' + ''' Type Of Tokenization + ''' + Public Function Tokenize(ByRef Corpus As List(Of String), tokenizationOption As TokenizerType) As List(Of Token) + Dim ivocabulary As New List(Of Token) + + For Each Doc In Corpus + Select Case tokenizationOption + Case TokenizerType._Char + ivocabulary.AddRange(TokenizeByCharacter(Doc.ToLower)) + Case TokenizerType._Word + ivocabulary.AddRange(TokenizeByWord(Doc.ToLower)) + Case TokenizerType._Sentence + ivocabulary.AddRange(TokenizeBySentence(Doc.ToLower)) + + + End Select + Next + + Return ivocabulary + End Function + Private Shared ReadOnly AlphaBet() As String = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", + "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", + "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"} + Private Shared ReadOnly Number() As String = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", +"30", "40", "50", "60", "70", "80", "90", "00", "000", "0000", "00000", "000000", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", +"nineteen", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand", "million", "Billion"} + Public Shared Function GetValidTokens(ByRef InputStr As String) As String + Dim EndStr As Integer = InputStr.Length + Dim CharStr As String = "" + For i = 0 To EndStr - 1 + If GetTokenType(InputStr(i)) <> TokenType.Ignore Then + CharStr = AddSuffix(CharStr, InputStr(i)) + Else + + End If + Next + Return CharStr + End Function + Private Shared Function AddSuffix(ByRef Str As String, ByVal Suffix As String) As String + Return Str & Suffix + End Function + Public Function GetEncapsulated(ByRef Userinput As String) As List(Of String) + GetEncapsulated = New List(Of String) + Do Until ContainsEncapsulated(Userinput) = False + GetEncapsulated.Add(ExtractEncapsulated(Userinput)) + Loop + End Function + Public Function ExtractEncapsulated(ByRef Userinput As String) As String + ExtractEncapsulated = Userinput + If ContainsEncapsulated(ExtractEncapsulated) = True Then + If ExtractEncapsulated.Contains("(") = True And ExtractEncapsulated.Contains(")") = True Then + ExtractEncapsulated = ExtractEncapsulated.ExtractStringBetween("(", ")") + End If + If Userinput.Contains("[") = True And Userinput.Contains("]") = True Then + ExtractEncapsulated = ExtractEncapsulated.ExtractStringBetween("[", "]") + End If + If Userinput.Contains("{") = True And Userinput.Contains("}") = True Then + ExtractEncapsulated = ExtractEncapsulated.ExtractStringBetween("{", "}") + End If + If Userinput.Contains("<") = True And Userinput.Contains(">") = True Then + ExtractEncapsulated = ExtractEncapsulated.ExtractStringBetween("<", ">") + End If + End If + End Function + + Public Function ContainsEncapsulated(ByRef Userinput As String) As Boolean + Dim Start = False + Dim Ending = False + ContainsEncapsulated = False + For Each item In PunctuationMarkers.EncapuslationPunctuationStart + If Userinput.Contains(item) = True Then Start = True + Next + For Each item In PunctuationMarkers.EncapuslationPunctuationEnd + If Userinput.Contains(item) = True Then Ending = True + Next + If Start And Ending = True Then + ContainsEncapsulated = True + End If + End Function + + Public Shared Function GetTokenType(ByRef CharStr As String) As TokenType + For Each item In PunctuationMarkers.SeperatorPunctuation + If CharStr = item Then Return TokenType.SeperatorPunctuation + Next + For Each item In PunctuationMarkers.GramaticalPunctuation + If CharStr = item Then Return TokenType.GramaticalPunctuation + Next + For Each item In PunctuationMarkers.EncapuslationPunctuationStart + If CharStr = item Then Return TokenType.EncapuslationPunctuationStart + Next + For Each item In PunctuationMarkers.EncapuslationPunctuationEnd + If CharStr = item Then Return TokenType.EncapuslationPunctuationEnd + Next + For Each item In PunctuationMarkers.MoneyPunctuation + If CharStr = item Then Return TokenType.MoneyPunctuation + Next + For Each item In PunctuationMarkers.MathPunctuation + If CharStr = item Then Return TokenType.MathPunctuation + Next + For Each item In PunctuationMarkers.CodePunctuation + If CharStr = item Then Return TokenType.CodePunctuation + Next + For Each item In AlphaBet + If CharStr = item Then Return TokenType.AlphaBet + Next + For Each item In Number + If CharStr = item Then Return TokenType.Number + Next + Return TokenType.Ignore + End Function + + ''' + ''' Returns Tokens With Positions + ''' + ''' + ''' + Public Shared Function TokenizeByCharacter(ByVal input As String) As List(Of Token) + Dim characters As Char() = input.ToCharArray() + Dim tokens As New List(Of Token) + Dim currentPosition As Integer = 0 + + For Each character As Char In characters + Dim startPosition As Integer = currentPosition + Dim endPosition As Integer = currentPosition + Dim token As New Token(TokenType.Character, character.ToString(), startPosition, endPosition) + tokens.Add(token) + currentPosition += 1 + Next + + Return tokens + End Function + + ''' + ''' Returns Tokens With Positions + ''' + ''' + ''' + Public Shared Function TokenizeBySentence(ByVal input As String) As List(Of Token) + Dim sentences As String() = input.Split("."c) + Dim tokens As New List(Of Token) + Dim currentPosition As Integer = 0 + + For Each sentence As String In sentences + Dim startPosition As Integer = currentPosition + Dim endPosition As Integer = currentPosition + sentence.Length - 1 + Dim token As New Token(TokenType.Sentence, sentence, startPosition, endPosition) + tokens.Add(token) + currentPosition = endPosition + 2 ' Account for the period and the space after the sentence + Next + + Return tokens + End Function + + ''' + ''' Returns Tokens With Positions + ''' + ''' + ''' + Public Shared Function TokenizeByWord(ByVal input As String) As List(Of Token) + Dim words As String() = input.Split(" "c) + Dim tokens As New List(Of Token) + Dim currentPosition As Integer = 0 + + For Each word As String In words + Dim startPosition As Integer = currentPosition + Dim endPosition As Integer = currentPosition + word.Length - 1 + Dim token As New Token(TokenType.Word, word, startPosition, endPosition) + tokens.Add(token) + currentPosition = endPosition + 2 ' Account for the space between words + Next + + Return tokens + End Function + + End Class + End Namespace + + + +End Namespace \ No newline at end of file diff --git a/SourceCode/Trees.vb b/SourceCode/Trees.vb index 755d56c..3edf46f 100644 --- a/SourceCode/Trees.vb +++ b/SourceCode/Trees.vb @@ -19,6 +19,7 @@ Namespace Models ''' if this tree requires data to be stroed it needs to be stored inside the dataStorae locations ''' + Public Class TryTree #Region "Public Fields" @@ -841,7 +842,7 @@ Namespace Models End Class - + Public Class TrieTree Public Overridable Property root As Node @@ -892,7 +893,7 @@ Namespace Models End Function End Class - + Public Class FrequencyTrieTree Inherits TrieTree Public Shadows Property root As FrequencyNode @@ -908,7 +909,7 @@ Namespace Models End Function End Class - + Public Class VocabularyTrieTree Inherits TrieTree Private Shadows WithEvents Root As VocabularyNode @@ -951,7 +952,7 @@ Namespace Models End Class Namespace BeliefTree - + Public Class ConditionalProbabilityTable Public Property Node As BeliefNode Public Property Values As Dictionary(Of List(Of String), Double) @@ -965,6 +966,7 @@ Namespace Models Values(parentStates) = value End Sub End Class + Public Class InferenceEngine Public Sub New(network As BeliefNetwork) Me.Network = network @@ -1033,6 +1035,7 @@ Namespace Models End While End Function End Class + Public Class BeliefNetwork Public Property Nodes As List(Of BeliefNode) Public Sub LoadTrainingData(trainingData As Dictionary(Of String, Dictionary(Of List(Of String), Double))) @@ -1271,6 +1274,7 @@ Namespace Models ''' organized then the tree can be reorganized the rules are such that the lowest numbers are ''' always on the left and the highest numbers are on the right ''' + Public Class BinaryTree ''' @@ -1403,6 +1407,7 @@ Namespace Models ''' when if held in the node would cause repetitive information being held. ''' This also enables for the Vocabulary to be used as a custom Dictionary object ''' + Public Structure VocabItem ''' @@ -1457,6 +1462,7 @@ Namespace Models ''' ''' Returns a list WordGram Probability Given a Sequence of Tokens ''' + Public Class Predict ''' @@ -1621,6 +1627,7 @@ Namespace Models End Class Namespace Nodes + Public Class BeliefNode Public Property Name As String Public Property States As List(Of String) @@ -1633,6 +1640,7 @@ Namespace Models Parents = New List(Of BeliefNode) End Sub End Class + Public Class Node @@ -2280,6 +2288,7 @@ Namespace Models End Function End Class + Public Class BinaryNode Public Left As BinaryNode Public Right As BinaryNode @@ -2390,6 +2399,7 @@ Namespace Models End Sub End Class + Public Class FrequencyNode Inherits Node @@ -2495,6 +2505,7 @@ Namespace Models ''' ''' Used To Hold Risk Evaluation Data ''' + Public Structure RiskNode Private mCost As Integer @@ -2575,6 +2586,7 @@ Namespace Models End Property End Structure + Public Class VocabularyNode Inherits FrequencyNode diff --git a/SourceCode/Utilitys.vb b/SourceCode/Utilitys.vb index c51664a..64514c5 100644 --- a/SourceCode/Utilitys.vb +++ b/SourceCode/Utilitys.vb @@ -4,9 +4,10 @@ Imports System.Web.Script.Serialization Imports InputModelling.DataObjects Imports InputModelling.Models.Embeddings.Text Imports InputModelling.Models.EntityModel +Imports InputModelling.Models.TokenizerModels Namespace Utilitys - + Public Class iCompare Public Shared Function GetDistinctWords(text As String) As HashSet(Of String) @@ -107,147 +108,7 @@ Namespace Utilitys End Function End Class - - Public Class SentenceSplitter - Public Const ClassId As String = "28993390-7702-401C-BAB3-38FF97BC1AC9" - Public Const EventsId As String = "CD334307-F53E-401A-AC6D-3CFDD86FD6F1" - Public Const InterfaceId As String = "8B3345B1-5D13-4059-829B-B531310144B5" - - ''' - ''' punctuation markers for end of sentences(individual thoughts) Set in order of Rank - ''' - Public Shared EndPunctuation() As String = {".", ";", "?", "!", ":"} - - ''' - ''' Punctuation(known) - ''' - Public Shared Punctuation() As String = {".", ",", ";", "?", "!", ":", "$", "%", "^", "*", "<", ">", -"/", "@", "(", ")", "'""{", "}", "[", "]", "\", "|", "+", "=", "_", "-"} - - Private mSent As List(Of String) - - ''' - ''' Provide text for sentence definition, - ''' - ''' - Public Sub New(ByVal Text As String) - mSent = SplitTextToSentences(Text) - End Sub - - ''' - ''' Returns number of sentences found - ''' - ''' - Public ReadOnly Property Count As Integer - Get - For Each Sent As String In Sentences - Count += 1 - - Next - Return Count - End Get - End Property - - Public ReadOnly Property Sentences As List(Of String) - Get - Return mSent - End Get - End Property - - ''' - ''' Removes Trailing Spaces as well as double spaces from Text Also the Text is Capitalized - ''' - ''' - ''' - Public Shared Function FormatText(ByRef Text As String) As String - Dim FormatTextResponse As String = "" - 'FORMAT USERINPUT - 'turn to uppercase for searching the db - Text = LTrim(Text) - Text = RTrim(Text) - Text = Text.Replace(" ", " ") - FormatTextResponse = Text - Return FormatTextResponse - End Function - - ''' - ''' finds sentences in text or phrase. based on EndPunctuation markers - ''' - ''' - ''' Returns a list of sentences defined in the text - Public Shared Function GetSentences(ByRef InputStr As String) As List(Of String) - GetSentences = New List(Of String) - Dim s As New SentenceSplitter(InputStr) - For Each Sent As String In s.Sentences - GetSentences.Add(Sent) - Next - End Function - - ''' - ''' Removes Punctuation from Text - ''' - ''' - ''' Cleaned Text - Public Shared Function RemovePunctuation(ByVal Text As String) As String - Dim mText As String = Text - For Each item As String In Punctuation - mText = mText.Replace(item, " ") - Next - mText = mText.Replace(" ", " ") - Return mText - End Function - - ''' - ''' Splits Sentences by the Punctution offered. As it may be prudent to split by "." then - ''' after by "," for sub components of the sentence etc - ''' - ''' text to be examined - ''' Punctuation to be used as end marker - ''' - Public Shared Function SplitTextToSentences(ByVal mText As String, ByVal mEndPunctuation As String) As List(Of String) - - Dim Text As String = mText - - Text = Text.Replace(mEndPunctuation, "#") - - Dim TempSentencesArray() As String = Split(Text, "#") - Dim mSentences As New List(Of String) - For Each SentStr As String In TempSentencesArray - If SentStr <> "" Then - mSentences.Add(FormatText(SentStr)) - End If - - Next - - Return mSentences - End Function - - ''' - ''' Splits to sentences based on all end markers in EndPunctuation - ''' - ''' - ''' - Private Function SplitTextToSentences(ByVal mText As String) As List(Of String) - - Dim Text As String = mText - For Each item As String In EndPunctuation - Text = Text.Replace(item, "#") - - Next - Dim TempSentencesArray() As String = Split(Text, "#") - Dim mSentences As New List(Of String) - For Each SentStr As String In TempSentencesArray - If SentStr <> "" Then - mSentences.Add(FormatText(SentStr)) - End If - - Next - - Return mSentences - End Function - - End Class - + Public Class ICollect Public Shared FemaleNames As List(Of String) @@ -551,7 +412,7 @@ Namespace Utilitys Dim word As String = words(i) Dim wordWithContext As New WordWithContext() With { - .word = word, + .Word = word, .IsFocusTerm = (i = focusIndex), .IsPreceding = (i < focusIndex), .IsFollowing = (i > focusIndex) @@ -1530,6 +1391,7 @@ Namespace Utilitys Namespace NN + Public Class Perceptron Public Property Weights As Double() ' The weights of the perceptron @@ -2007,2531 +1869,2722 @@ Namespace Utilitys End Class End Namespace Namespace TEXT + Public Module TextExtensions - - Public Class Summarise - - Public Function GenerateSummary(ByRef Text As String, ByRef Entitys As List(Of String)) As String - ' Step 5: Generate the summary - Return String.Join(vbNewLine, ExtractImportantSentencesInText(Text, Entitys, True, 2)) + ''' + ''' Add full stop to end of String + ''' + ''' + ''' + + Public Function AddFullStop(ByRef MESSAGE As String) As String + AddFullStop = MESSAGE + If MESSAGE = "" Then Exit Function + MESSAGE = Trim(MESSAGE) + If MESSAGE Like "*." Then Exit Function + AddFullStop = MESSAGE + "." End Function - Public Function GenerateSummary(ByVal text As String, ByVal entities As List(Of String), ByVal numContextSentencesBefore As Integer, ByVal numContextSentencesAfter As Integer) As String - ' Extract important sentences with context - Dim importantSentences As List(Of String) = ExtractImportantSentencesInText(text, entities, numContextSentencesBefore, numContextSentencesAfter) - - ' Generate the summary - Dim summary As String = String.Join(". ", importantSentences) + ''' + ''' Adds string to end of string (no spaces) + ''' + ''' base string + ''' Add before (no spaces) + ''' + + Public Function AddPrefix(ByRef Str As String, ByVal Prefix As String) As String + Return Prefix & Str + End Function - Return summary + ''' + ''' Adds Suffix to String (No Spaces) + ''' + ''' Base string + ''' To be added After + ''' + + Public Function AddSuffix(ByRef Str As String, ByVal Suffix As String) As String + Return Str & Suffix End Function ''' - ''' Searches for important sentences in text , identified by the presence of an entity from this list - ''' These lists can be specific to a particular topic or entity or a search query + ''' GO THROUGH EACH CHARACTER AND ' IF PUNCTUATION IE .!?,:'"; REPLACE WITH A SPACE ' IF , + ''' OR . THEN CHECK IF BETWEEN TWO NUMBERS, IF IT IS ' THEN LEAVE IT, ELSE REPLACE IT WITH A + ''' SPACE ' ''' - ''' - ''' Entity list - ''' - ''' + ''' String to be formatted ''' - Public Function ExtractImportantSentencesInText(ByRef Text As String, - EntityList As List(Of String), - Optional WithContext As Boolean = False, - Optional NumberOfContextSentences As Integer = 0) As List(Of String) - Dim Sents As New List(Of String) + ''' + + Public Function AlphaNumericalOnly(ByRef STRINPUT As String) As String - Select Case WithContext - Case False + Dim A As Short + For A = 1 To Len(STRINPUT) + If Mid(STRINPUT, A, 1) = "." Or + Mid(STRINPUT, A, 1) = "!" Or + Mid(STRINPUT, A, 1) = "?" Or + Mid(STRINPUT, A, 1) = "," Or + Mid(STRINPUT, A, 1) = ":" Or + Mid(STRINPUT, A, 1) = "'" Or + Mid(STRINPUT, A, 1) = "[" Or + Mid(STRINPUT, A, 1) = """" Or + Mid(STRINPUT, A, 1) = ";" Then - For Each Sent In Split(Text, ".") - For Each Entity In EntityList - If Sent.Contains(Entity) Then - Sents.Add(Sent) + ' BEGIN CHECKING PERIODS AND COMMAS THAT ARE IN BETWEEN NUMBERS ' + If Mid(STRINPUT, A, 1) = "." Or Mid(STRINPUT, A, 1) = "," Then + If Not (A - 1 = 0 Or A = Len(STRINPUT)) Then + If Not (IsNumeric(Mid(STRINPUT, A - 1, 1)) Or IsNumeric(Mid(STRINPUT, A + 1, 1))) Then + STRINPUT = Mid(STRINPUT, 1, A - 1) & " " & Mid(STRINPUT, A + 1, Len(STRINPUT) - A) End If - Next + Else + STRINPUT = Mid(STRINPUT, 1, A - 1) & " " & Mid(STRINPUT, A + 1, Len(STRINPUT) - A) + End If + Else + STRINPUT = Mid(STRINPUT, 1, A - 1) & " " & Mid(STRINPUT, A + 1, Len(STRINPUT) - A) + End If - Next - Return Sents.Distinct.ToList - Case True + ' END CHECKING PERIODS AND COMMAS IN BETWEEN NUMBERS ' + End If + Next A + ' RETURN PUNCTUATION STRIPPED STRING ' + AlphaNumericalOnly = STRINPUT.Replace(" ", " ") + End Function - For Each Sent In Split(Text, ".") - For Each Entity In EntityList - If Sent.ToLower.Contains(Entity.ToLower) Then - Sents.AddRange(ExtractContextSentences(Text, Sent, NumberOfContextSentences)) + + Public Function AlphaNumericOnly(ByRef txt As String) As String + Dim NewText As String = "" + Dim IsLetter As Boolean = False + Dim IsNumerical As Boolean = False + For Each chr As Char In txt + IsNumerical = False + IsLetter = False + For Each item In AlphaBet + If IsLetter = False Then + If chr.ToString = item Then + IsLetter = True + Else + End If + End If + Next + 'Check Numerical + If IsLetter = False Then + For Each item In Numerical + If IsNumerical = False Then + If chr.ToString = item Then + IsNumerical = True + Else End If - Next - + End If Next - Return Sents.Distinct.ToList - End Select + Else + End If + If IsLetter = True Or IsNumerical = True Then + NewText &= chr.ToString + Else + NewText &= " " + End If + Next + NewText = NewText.Replace(" ", " ") + Return NewText + End Function - Return Sents.Distinct.ToList + 'Text + + Public Function Capitalise(ByRef MESSAGE As String) As String + Dim FirstLetter As String + Capitalise = "" + If MESSAGE = "" Then Exit Function + FirstLetter = Left(MESSAGE, 1) + FirstLetter = UCase(FirstLetter) + MESSAGE = Right(MESSAGE, Len(MESSAGE) - 1) + Capitalise = (FirstLetter + MESSAGE) End Function ''' - ''' grabs important sentences from text based on the entity list provided . - ''' (values or terms or noun phrases or verb phrases) as this is a sentence level search - ''' it also grabs the context sentences surrounding it based on the inputs + ''' Capitalizes the text ''' - ''' - ''' - ''' - ''' + ''' ''' - Public Function ExtractImportantSentencesInText(ByVal text As String, ByVal entityList As List(Of String), ByVal numContextSentencesBefore As Integer, ByVal numContextSentencesAfter As Integer) As List(Of String) - Dim importantSentences As New List(Of String) + + Public Function CapitaliseTEXT(ByVal MESSAGE As String) As String + Dim FirstLetter As String = "" + CapitaliseTEXT = "" + If MESSAGE = "" Then Exit Function + FirstLetter = Left(MESSAGE, 1) + FirstLetter = UCase(FirstLetter) + MESSAGE = Right(MESSAGE, Len(MESSAGE) - 1) + CapitaliseTEXT = (FirstLetter + MESSAGE) + End Function - For Each sentence In text.Split("."c) - For Each entity In entityList - If sentence.ToLower.Contains(entity.ToLower) Then - ' Add the current sentence and the context sentences - importantSentences.AddRange(ExtractContextSentences(text, sentence, numContextSentencesBefore, numContextSentencesAfter)) - Exit For ' Break out of the inner loop if the entity is found in the sentence + ''' + ''' Capitalise the first letter of each word / Tilte Case + ''' + ''' A string - paragraph or sentence + ''' String + + Public Function CapitalizeWords(ByVal words As String) + Dim output As System.Text.StringBuilder = New System.Text.StringBuilder() + Dim exploded = words.Split(" ") + If (exploded IsNot Nothing) Then + For Each word As String In exploded + If word IsNot Nothing Then + output.Append(word.Substring(0, 1).ToUpper).Append(word.Substring(1, word.Length - 1)).Append(" ") End If + Next - Next + End If + + Return output.ToString() - Return importantSentences.Distinct().ToList() End Function ''' - ''' Gets important Sentences in text with or without context + ''' A string extension method that query if this object contains the given value. ''' - ''' - ''' - ''' - ''' - ''' - ''' - Public Function ExtractImportantSentencesInText(ByRef Text As String, EntityList As List(Of String), Optional WithContext As Boolean = False, - Optional NumberOfContextSentencesBefore As Integer = 0, - Optional NumberOfContextSentencesAfter As Integer = 0) As List(Of String) - Dim importantSentences As New List(Of String) - - For Each sentence In Split(Text, ".") - For Each entity In EntityList - If sentence.ToLower.Contains(entity.ToLower) Then - importantSentences.Add(sentence) - Exit For ' Break out of the inner loop if the entity is found in the sentence - End If - Next - Next + ''' The @this to act on. + ''' The value. + ''' true if the value is in the string, false if not. + + Public Function Contains(this As String, value As String) As Boolean + Return this.IndexOf(value) <> -1 + End Function - If WithContext Then - Dim sentencesWithContext As New List(Of String) - For Each sentence In importantSentences - sentencesWithContext.AddRange(ExtractContextSentences(Text, sentence, NumberOfContextSentencesBefore, NumberOfContextSentencesAfter)) - Next - Return sentencesWithContext - Else - Return importantSentences - End If + ''' + ''' A string extension method that query if this object contains the given value. + ''' + ''' The @this to act on. + ''' The value. + ''' Type of the comparison. + ''' true if the value is in the string, false if not. + + Public Function Contains(this As String, value As String, comparisonType As StringComparison) As Boolean + Return this.IndexOf(value, comparisonType) <> -1 End Function ''' - ''' Given an important Sentence Extract its surrounding context Sentences + ''' Checks if String Contains Letters ''' - ''' - ''' Important Sentence to match - ''' Number of Sentences Either Side + ''' ''' - Public Function ExtractContextSentences(ByRef Text As String, ByRef ImportantSentence As String, ByRef ConTextInt As Integer) As List(Of String) - Dim ContextSentences As New List(Of String) - Dim CurrentSentences As New List(Of String) - Dim Count As Integer = 0 + + Public Function ContainsLetters(ByVal str As String) As Boolean - For Each Sent In Split(Text, ".") - CurrentSentences.Add(Sent) - Count += 1 - If Sent = ImportantSentence Then - 'Get Previous sentences + For i = 0 To str.Length - 1 + If Char.IsLetter(str.Chars(i)) Then + Return True + End If + Next - For i = 0 To ConTextInt - Dim Index = Count - 1 - If Index >= 0 Or Index < CurrentSentences.Count Then + Return False - ContextSentences.Add(CurrentSentences(Index)) + End Function - End If - Next - ContextSentences.Add(ImportantSentence) - 'GetFollowing Sentences - For i = 0 To ConTextInt - If Count + i < CurrentSentences.Count Then - ContextSentences.Add(CurrentSentences(Count + i)) - End If - Next - End If - Next - Return ContextSentences + ''' + ''' Counts the number of elements in the text, useful for declaring arrays when the element + ''' length is unknown could be used to split sentence on full stop Find Sentences then again + ''' on comma(conjunctions) "Find Clauses" NumberOfElements = CountElements(Userinput, delimiter) + ''' + ''' + ''' + ''' Integer : number of elements found + ''' + + Public Function CountElements(ByVal PHRASE As String, ByVal Delimiter As String) As Integer + Dim elementcounter As Integer = 0 + Dim PhraseArray As String() + PhraseArray = PHRASE.Split(Delimiter) + elementcounter = UBound(PhraseArray) + Return elementcounter End Function ''' - ''' Given an important Sentence Extract its surrounding context Sentences - - ''' In some cases it may be prudent to grab only a single sentence before and multiple sentences after - ''' important to know which context is important in which instance + ''' counts occurrences of a specific phoneme ''' - ''' Document - ''' Sentence to be matched - ''' number of - ''' number of + ''' + ''' ''' - Public Function ExtractContextSentences(ByVal text As String, ByVal importantSentence As String, ByVal numContextSentencesBefore As Integer, ByVal numContextSentencesAfter As Integer) As List(Of String) - Dim contextSentences As New List(Of String) - Dim allSentences As List(Of String) = text.Split("."c).ToList() - Dim sentenceIndex As Integer = allSentences.IndexOf(importantSentence) - - ' Get sentences before the important sentence - Dim startIndex As Integer = Math.Max(0, sentenceIndex - numContextSentencesBefore) - For i = startIndex To sentenceIndex - 1 - contextSentences.Add(allSentences(i)) - Next + ''' + + Public Function CountOccurrences(ByRef strIn As String, ByRef strFind As String) As Integer + '** + ' Returns: the number of times a string appears in a string + ' + '@rem Example code for CountOccurrences() + ' + ' ' Counts the occurrences of "ow" in the supplied string. + ' + ' strTmp = "How now, brown cow" + ' Returns a value of 4 + ' + ' + 'Debug.Print "CountOccurrences(): there are " & CountOccurrences(strTmp, "ow") & + '" occurrences of 'ow'" & " in the string '" & strTmp & "'" + ' + '@param strIn Required. String. + '@param strFind Required. String. + '@return Long. - ' Add the important sentence - contextSentences.Add(importantSentence) + Dim lngPos As Integer + Dim lngWordCount As Integer - ' Get sentences after the important sentence - Dim endIndex As Integer = Math.Min(sentenceIndex + numContextSentencesAfter, allSentences.Count - 1) - For i = sentenceIndex + 1 To endIndex - contextSentences.Add(allSentences(i)) - Next + On Error GoTo PROC_ERR - Return contextSentences - End Function + lngWordCount = 1 - Public Function GenerateTextFromEntities(entities As List(Of String), storedSentences As List(Of String)) As String - ' Implement your custom text generation logic here - ' Generate text using the entities and stored sentences + ' Find the first occurrence + lngPos = InStr(strIn, strFind) - Dim generatedText As String = "" + Do While lngPos > 0 + ' Find remaining occurrences + lngPos = InStr(lngPos + 1, strIn, strFind) + If lngPos > 0 Then + ' Increment the hit counter + lngWordCount = lngWordCount + 1 + End If + Loop - ' Example text generation logic - For Each entity As String In entities - Dim matchingSentences As List(Of String) = FindSentencesWithEntity(entity, storedSentences) + ' Return the value + CountOccurrences = lngWordCount - ' Randomly select a sentence from the matching sentences - Dim random As New Random() - Dim selectedSentence As String = matchingSentences(random.Next(0, matchingSentences.Count)) +PROC_EXIT: + Exit Function - ' Replace the entity tag with the actual entity in the selected sentence - Dim generatedSentence As String = selectedSentence.Replace(entity, "<<" & entity & ">>") +PROC_ERR: + MsgBox("Error: " & Err.Number & ". " & Err.Description, , NameOf(CountOccurrences)) + Resume PROC_EXIT - ' Append the generated sentence to the generated text - generatedText &= generatedSentence & " " - Next + End Function + + + Public Function CountVowels(ByVal InputString As String) As Integer + Dim v(9) As String 'Declare an array of 10 elements 0 to 9 + Dim vcount As Short 'This variable will contain number of vowels + Dim flag As Integer + Dim strLen As Integer + Dim i As Integer + v(0) = "a" 'First element of array is assigned small a + v(1) = "i" + v(2) = "o" + v(3) = "u" + v(4) = "e" + v(5) = "A" 'Sixth element is assigned Capital A + v(6) = "I" + v(7) = "O" + v(8) = "U" + v(9) = "E" + strLen = Len(InputString) + + For flag = 1 To strLen 'It will get every letter of entered string and loop + 'will terminate when all letters have been examined + + For i = 0 To 9 'Takes every element of v(9) one by one + 'Check if current letter is a vowel + If Mid(InputString, flag, 1) = v(i) Then + vcount = vcount + 1 ' If letter is equal to vowel + 'then increment vcount by 1 + End If + Next i 'Consider next value of v(i) + Next flag 'Consider next letter of the entered string + + CountVowels = vcount - Return generatedText.Trim() End Function - Public Function FindSentencesWithEntity(entity As String, storedSentences As List(Of String)) As List(Of String) - ' Implement your custom logic to find sentences that contain the given entity - ' Return a list of sentences that match the entity + ''' + ''' Counts tokens in string + ''' + ''' string to be searched + ''' delimiter such as space comma etc + ''' + + Public Function CountTokensInString(ByRef Str As String, ByRef Delimiter As String) As Integer + Dim Words() As String = Split(Str, Delimiter) + Return Words.Count + End Function - Dim matchingSentences As New List(Of String) + ''' + ''' Counts the words in a given text + ''' + ''' + ''' integer: number of words + ''' + + Public Function CountWords(NewText As String) As Integer + Dim TempArray() As String = NewText.Split(" ") + CountWords = UBound(TempArray) + Return CountWords + End Function - ' Example logic: Check if the entity appears in each stored sentence - For Each sentence As String In storedSentences - If sentence.Contains(entity) Then - matchingSentences.Add(sentence) + ''' + ''' checks Str contains keyword regardless of case + ''' + ''' + ''' + ''' + + Public Function DetectKeyWord(ByRef Userinput As String, ByRef Keyword As String) As Boolean + Dim mfound As Boolean = False + If UCase(Userinput).Contains(UCase(Keyword)) = True Or + InStr(Userinput, Keyword) > 1 Then + mfound = True + End If + + Return mfound + End Function + + ''' + ''' DETECT IF STATMENT IS AN IF/THEN DETECT IF STATMENT IS AN IF/THEN -- -RETURNS PARTS DETIFTHEN + ''' = DETECTLOGIC(USERINPUT, "IF", "THEN", IFPART, THENPART) + ''' + ''' + ''' "IF", can also be replace by "IT CAN BE SAID THAT + ''' "THEN" can also be replaced by "it must follow that" + ''' supply empty string to be used to hold part + ''' supply empty string to be used to hold part + ''' true/false + ''' + + Public Function DetectLOGIC(ByRef userinput As String, ByRef LOGICA As String, ByRef LOGICB As String, ByRef IFPART As String, ByRef THENPART As String) As Boolean + If InStr(1, userinput, LOGICA, 1) > 0 And InStr(1, userinput, " " & LOGICB & " ", 1) > 0 Then + 'SPLIT USER INPUT + Call SplitPhrase(userinput, " " & LOGICB & " ", IFPART, THENPART) + + IFPART = Replace(IFPART, LOGICA, "", 1, -1, CompareMethod.Text) + THENPART = Replace(THENPART, " " & LOGICB & " ", "", 1, -1, CompareMethod.Text) + DetectLOGIC = True + Else + DetectLOGIC = False + End If + End Function + + ''' + ''' Expand a string such as a field name by inserting a space ahead of each capitalized + ''' letter (where none exists). + ''' + ''' + ''' Expanded string + ''' + + Public Function ExpandToWords(ByVal inputString As String) As String + If inputString Is Nothing Then Return Nothing + Dim charArray = inputString.ToCharArray + Dim outStringBuilder As New System.Text.StringBuilder(inputString.Length + 10) + For index = 0 To charArray.GetUpperBound(0) + If Char.IsUpper(charArray(index)) Then + 'If previous character is also uppercase, don't expand as this may be an acronym. + If (index > 0) AndAlso Char.IsUpper(charArray(index - 1)) Then + outStringBuilder.Append(charArray(index)) + Else + outStringBuilder.Append(String.Concat(" ", charArray(index))) + End If + Else + outStringBuilder.Append(charArray(index)) End If Next - Return matchingSentences + Return outStringBuilder.ToString.Replace("_", " ").Trim + End Function - End Class - ''' - ''' TO USE THE PROGRAM CALL THE FUNCTION PORTERALGORITHM. THE WORD - ''' TO BE STEMMED SHOULD BE PASSED AS THE ARGUEMENT ARGUEMENT. THE STRING - ''' RETURNED BY THE FUNCTION IS THE STEMMED WORD - ''' Porter Stemmer. It follows the algorithm definition - ''' presented in : - ''' Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, - ''' no. 3, pp 130-137, - ''' - Public Class WordStemmer + ''' + ''' A string extension method that extracts this object. + ''' + ''' The @this to act on. + ''' The predicate. + ''' A string. + + Public Function Extract(this As String, predicate As Func(Of Char, Boolean)) As String + Return New String(this.ToCharArray().Where(predicate).ToArray()) + End Function - ' (http://www.tartarus.org/~martin/PorterStemmer) + + Public Function ExtractFirstChar(ByRef InputStr As String) As String - 'Author : Navonil Mustafee - 'Brunel University - student - 'Algorithm Implemented as part for assignment on document visualization + ExtractFirstChar = Left(InputStr, 1) + End Function - Public Shared Function StemWord(str As String) As String + + Public Function ExtractFirstWord(ByRef Statement As String) As String + Dim StrArr() As String = Split(Statement, " ") + Return StrArr(0) + End Function - 'only strings greater than 2 are stemmed - If Len(Trim(str)) > 0 Then - str = porterAlgorithmStep1(str) - str = porterAlgorithmStep2(str) - str = porterAlgorithmStep3(str) - str = porterAlgorithmStep4(str) - str = porterAlgorithmStep5(str) - End If + + Public Function ExtractLastChar(ByRef InputStr As String) As String - 'End of Porter's algorithm.........returning the word - StemWord = str + ExtractLastChar = Right(InputStr, 1) + End Function + ''' + ''' Returns The last word in String + ''' NOTE: String ois converted to Array then the last element is extracted Count-1 + ''' + ''' + ''' String + + Public Function ExtractLastWord(ByRef InputStr As String) As String + Dim TempArr() As String = Split(InputStr, " ") + Dim Count As Integer = TempArr.Count - 1 + Return TempArr(Count) End Function - Private Shared Function porterAlgorithmStep1(str As String) As String + ''' + ''' A string extension method that extracts the letter described by @this. + ''' + ''' The @this to act on. + ''' The extracted letter. + + Public Function ExtractLetter(this As String) As String + Return New String(this.ToCharArray().Where(Function(x) [Char].IsLetter(x)).ToArray()) + End Function - On Error Resume Next + ''' + ''' A string extension method that extracts the number described by @this. + ''' + ''' The @this to act on. + ''' The extracted number. + + Public Function ExtractNumber(this As String) As String + Return New String(this.ToCharArray().Where(Function(x) [Char].IsNumber(x)).ToArray()) + End Function - 'STEP 1A - ' - ' SSES -> SS caresses -> caress - ' IES -> I ponies -> poni - ' ties -> ti - ' SS -> SS caress -> caress - ' S -> cats -> cat + ''' + ''' extracts string between defined strings + ''' + ''' base sgtring + ''' Start string + ''' End string + ''' + + Public Function ExtractStringBetween(ByVal value As String, ByVal strStart As String, ByVal strEnd As String) As String + If Not String.IsNullOrEmpty(value) Then + Dim i As Integer = value.IndexOf(strStart) + Dim j As Integer = value.IndexOf(strEnd) + Return value.Substring(i, j - i) + Else + Return value + End If + End Function - 'declaring local variables - Dim i As Byte - Dim j As Byte - Dim step1a(3, 1) As String + ''' + ''' Extract words Either side of Divider + ''' + ''' + ''' + ''' Front = F Back =B + ''' + + Public Function ExtractWordsEitherSide(ByRef TextStr As String, ByRef Divider As String, ByRef Mode As String) As String + ExtractWordsEitherSide = "" + Select Case Mode + Case "F" + Return ExtractWordsEitherSide(TextStr, Divider, "F") + Case "B" + Return ExtractWordsEitherSide(TextStr, Divider, "B") + End Select - 'initializing contents of 2D array - step1a(0, 0) = "sses" - step1a(0, 1) = "ss" - step1a(1, 0) = "ies" - step1a(1, 1) = "i" - step1a(2, 0) = "ss" - step1a(2, 1) = "ss" - step1a(3, 0) = "s" - step1a(3, 1) = "" + End Function - 'checking word - For i = 0 To 3 Step 1 - If porterEndsWith(str, step1a(i, 0)) Then - str = porterTrimEnd(str, Len(step1a(i, 0))) - str = porterAppendEnd(str, step1a(i, 1)) - Exit For - End If - Next i + ' Generate a random number based on the upper and lower bounds of the array, + 'then use that to return the item. + + Public Function FetchRandomItem(Of t)(ByRef theArray() As t) As t - '-------------------------------------------------------------------------------------------------------- + Dim randNumberGenerator As New Random + Randomize() + Dim index As Integer = randNumberGenerator.Next(theArray.GetLowerBound(0), + theArray.GetUpperBound(0) + 1) - 'STEP 1B - ' - ' If - ' (m>0) EED -> EE feed -> feed - ' agreed -> agree - ' Else - ' (*v*) ED -> plastered -> plaster - ' bled -> bled - ' (*v*) ING -> motoring -> motor - ' sing -> sing - ' - 'If the second or third of the rules in Step 1b is successful, the following - 'is done: - ' - ' AT -> ATE conflat(ed) -> conflate - ' BL -> BLE troubl(ed) -> trouble - ' IZ -> IZE siz(ed) -> size - ' (*d and not (*L or *S or *Z)) - ' -> single letter - ' hopp(ing) -> hop - ' tann(ed) -> tan - ' fall(ing) -> fall - ' hiss(ing) -> hiss - ' fizz(ed) -> fizz - ' (m=1 and *o) -> E fail(ing) -> fail - ' fil(ing) -> file - ' - 'The rule to map to a single letter causes the removal of one of the double - 'letter pair. The -E is put back on -AT, -BL and -IZ, so that the suffixes - '-ATE, -BLE and -IZE can be recognised later. This E may be removed in step - '4. + Return theArray(index) - 'declaring local variables - Dim m As Byte - Dim temp As String - Dim second_third_success As Boolean + End Function - 'initializing contents of 2D array - second_third_success = False + ''' + ''' Define the search terms. This list could also be dynamically populated at runtime Find + ''' sentences that contain all the terms in the wordsToMatch array Note that the number of + ''' terms to match is not specified at compile time + ''' + ''' String to be searched + ''' List of Words to be detected + ''' Sentences containing words + + Public Function FindSentencesContaining(ByRef TextStr1 As String, ByRef Words As List(Of String)) As List(Of String) + ' Split the text block into an array of sentences. + Dim sentences As String() = TextStr1.Split(New Char() {".", "?", "!"}) + + Dim wordsToMatch(Words.Count) As String + Dim I As Integer = 0 + For Each item In Words + wordsToMatch(I) = item + I += 1 + Next - '(m>0) EED -> EE..else..(*v*) ED ->(*v*) ING -> - If porterEndsWith(str, "eed") Then + Dim sentenceQuery = From sentence In sentences + Let w = sentence.Split(New Char() {" ", ",", ".", ";", ":"}, + StringSplitOptions.RemoveEmptyEntries) + Where w.Distinct().Intersect(wordsToMatch).Count = wordsToMatch.Count() + Select sentence - 'counting the number of m's - temp = porterTrimEnd(str, Len("eed")) - m = porterCountm(temp) + ' Execute the query - If m > 0 Then - str = porterTrimEnd(str, Len("eed")) - str = porterAppendEnd(str, "ee") - End If + Dim StrList As New List(Of String) + For Each str As String In sentenceQuery + StrList.Add(str) + Next + Return StrList + End Function - ElseIf porterEndsWith(str, "ed") Then + + Public Function FormatJsonOutput(ByVal jsonString As String) As String + Dim stringBuilder = New StringBuilder() + Dim escaping As Boolean = False + Dim inQuotes As Boolean = False + Dim indentation As Integer = 0 - 'trim and check for vowel - temp = porterTrimEnd(str, Len("ed")) + For Each character As Char In jsonString - If porterContainsVowel(temp) Then - str = porterTrimEnd(str, Len("ed")) - second_third_success = True - End If + If escaping Then + escaping = False + stringBuilder.Append(character) + Else - ElseIf porterEndsWith(str, "ing") Then + If character = "\"c Then + escaping = True + stringBuilder.Append(character) + ElseIf character = """"c Then + inQuotes = Not inQuotes + stringBuilder.Append(character) + ElseIf Not inQuotes Then + + If character = ","c Then + stringBuilder.Append(character) + stringBuilder.Append(vbCrLf) + stringBuilder.Append(vbTab, indentation) + ElseIf character = "["c OrElse character = "{"c Then + stringBuilder.Append(character) + stringBuilder.Append(vbCrLf) + stringBuilder.Append(vbTab, System.Threading.Interlocked.Increment(indentation)) + ElseIf character = "]"c OrElse character = "}"c Then + stringBuilder.Append(vbCrLf) + stringBuilder.Append(vbTab, System.Threading.Interlocked.Decrement(indentation)) + stringBuilder.Append(character) + ElseIf character = ":"c Then + stringBuilder.Append(character) + stringBuilder.Append(vbTab) + ElseIf Not Char.IsWhiteSpace(character) Then + stringBuilder.Append(character) + End If + Else + stringBuilder.Append(character) + End If + End If + Next - 'trim and check for vowel - temp = porterTrimEnd(str, Len("ing")) + Return stringBuilder.ToString() + End Function - If porterContainsVowel(temp) Then - str = porterTrimEnd(str, Len("ing")) - second_third_success = True - End If + + Public Function FormatText(ByRef Text As String) As String + Dim FormatTextResponse As String = "" + 'FORMAT USERINPUT + 'turn to uppercase for searching the db + Text = LTrim(Text) + Text = RTrim(Text) + Text = UCase(Text) + + FormatTextResponse = Text + Return FormatTextResponse + End Function + ''' + ''' Gets the string after the given string parameter. + ''' + ''' The default value. + ''' The given string parameter. + ''' + ''' Unlike GetBefore, this method trims the result + + Public Function GetAfter(value As String, x As String) As String + Dim xPos = value.LastIndexOf(x, StringComparison.Ordinal) + If xPos = -1 Then + Return [String].Empty End If + Dim startIndex = xPos + x.Length + Return If(startIndex >= value.Length, [String].Empty, value.Substring(startIndex).Trim()) + End Function - 'If the second or third of the rules in Step 1b is SUCCESSFUL, the following - 'is done: - ' - ' AT -> ATE conflat(ed) -> conflate - ' BL -> BLE troubl(ed) -> trouble - ' IZ -> IZE siz(ed) -> size - ' (*d and not (*L or *S or *Z)) - ' -> single letter - ' hopp(ing) -> hop - ' tann(ed) -> tan - ' fall(ing) -> fall - ' hiss(ing) -> hiss - ' fizz(ed) -> fizz - ' (m=1 and *o) -> E fail(ing) -> fail - ' fil(ing) -> file + ''' + ''' Gets the string before the given string parameter. + ''' + ''' The default value. + ''' The given string parameter. + ''' + ''' Unlike GetBetween and GetAfter, this does not Trim the result. + + Public Function GetBefore(value As String, x As String) As String + Dim xPos = value.IndexOf(x, StringComparison.Ordinal) + Return If(xPos = -1, [String].Empty, value.Substring(0, xPos)) + End Function - If second_third_success = True Then 'If the second or third of the rules in Step 1b is SUCCESSFUL + ''' + ''' Gets the string between the given string parameters. + ''' + ''' The source value. + ''' The left string sentinel. + ''' The right string sentinel + ''' + ''' Unlike GetBefore, this method trims the result + + Public Function GetBetween(value As String, x As String, y As String) As String + Dim xPos = value.IndexOf(x, StringComparison.Ordinal) + Dim yPos = value.LastIndexOf(y, StringComparison.Ordinal) + If xPos = -1 OrElse xPos = -1 Then + Return [String].Empty + End If + Dim startIndex = xPos + x.Length + Return If(startIndex >= yPos, [String].Empty, value.Substring(startIndex, yPos - startIndex).Trim()) + End Function - If porterEndsWith(str, "at") Then 'AT -> ATE - str = porterTrimEnd(str, Len("at")) - str = porterAppendEnd(str, "ate") - ElseIf porterEndsWith(str, "bl") Then 'BL -> BLE - str = porterTrimEnd(str, Len("bl")) - str = porterAppendEnd(str, "ble") - ElseIf porterEndsWith(str, "iz") Then 'IZ -> IZE - str = porterTrimEnd(str, Len("iz")) - str = porterAppendEnd(str, "ize") - ElseIf porterEndsDoubleConsonent(str) Then '(*d and not (*L or *S or *Z))-> single letter - If Not (porterEndsWith(str, "l") Or porterEndsWith(str, "s") Or porterEndsWith(str, "z")) Then - str = porterTrimEnd(str, 1) - End If - ElseIf porterCountm(str) = 1 Then '(m=1 and *o) -> E - If porterEndsCVC(str) Then - str = porterAppendEnd(str, "e") - End If - End If + ''' + ''' Returns the first Word + ''' + ''' + ''' + + Public Function GetPrefix(ByRef Statement As String) As String + Dim StrArr() As String = Split(Statement, " ") + Return StrArr(0) + End Function + + + Public Function GetRandItemfromList(ByRef li As List(Of String)) As String + Randomize() + Return li.Item(Int(Rnd() * (li.Count - 1))) + End Function + + ''' + ''' Returns random character from string given length of the string to choose from + ''' + ''' + ''' + ''' + + Public Function GetRndChar(ByVal Source As String, ByVal Length As Integer) As String + Dim rnd As New Random + If Source Is Nothing Then Throw New ArgumentNullException(NameOf(Source), "Must contain a string,") + If Length <= 0 Then Throw New ArgumentException("Length must be a least one.", NameOf(Length)) + Dim s As String = "" + Dim builder As New System.Text.StringBuilder() + builder.Append(s) + For i = 1 To Length + builder.Append(Source(rnd.Next(0, Source.Length))) + Next + s = builder.ToString() + Return s + End Function + ''' + ''' Returns from index to end of file + ''' + ''' String + ''' Index + ''' + + Public Function GetSlice(ByRef Str As String, ByRef indx As Integer) As String + If indx <= Str.Length Then + Str.Substring(indx, Str.Length) + Return Str(indx) + Else End If + Return Nothing + End Function - '-------------------------------------------------------------------------------------------------------- - ' - 'STEP 1C - ' - ' (*v*) Y -> I happy -> happi - ' sky -> sky + ''' + ''' gets the last word + ''' + ''' + ''' + + Public Function GetSuffix(ByRef InputStr As String) As String + Dim TempArr() As String = Split(InputStr, " ") + Dim Count As Integer = TempArr.Count - 1 + Return TempArr(Count) + End Function - If porterEndsWith(str, "y") Then + + Public Function GetWordsBetween(ByRef InputStr As String, ByRef StartStr As String, ByRef StopStr As String) + Return InputStr.ExtractStringBetween(StartStr, StopStr) + End Function - 'trim and check for vowel - temp = porterTrimEnd(str, 1) + ''' + ''' A string extension method that query if '@this' satisfy the specified pattern. + ''' + ''' The @this to act on. + ''' The pattern to use. Use '*' as wildcard string. + ''' true if '@this' satisfy the specified pattern, false if not. + + Public Function IsLike(this As String, pattern As String) As Boolean + ' Turn the pattern into regex pattern, and match the whole string with ^$ + Dim regexPattern As String = "^" + Regex.Escape(pattern) + "$" + + ' Escape special character ?, #, *, [], and [!] + regexPattern = regexPattern.Replace("\[!", "[^").Replace("\[", "[").Replace("\]", "]").Replace("\?", ".").Replace("\*", ".*").Replace("\#", "\d") + + Return Regex.IsMatch(this, regexPattern) + End Function - If porterContainsVowel(temp) Then - str = porterTrimEnd(str, Len("y")) - str = porterAppendEnd(str, "i") - End If + ''' + ''' Checks if string is a reserved VBscipt Keyword + ''' + ''' + ''' + + Function IsReservedWord(ByVal keyword As String) As Boolean + Dim IsReserved = False + Select Case LCase(keyword) + Case "and" : IsReserved = True + Case "as" : IsReserved = True + Case "boolean" : IsReserved = True + Case "byref" : IsReserved = True + Case "byte" : IsReserved = True + Case "byval" : IsReserved = True + Case "call" : IsReserved = True + Case "case" : IsReserved = True + Case "class" : IsReserved = True + Case "const" : IsReserved = True + Case "currency" : IsReserved = True + Case "debug" : IsReserved = True + Case "dim" : IsReserved = True + Case "do" : IsReserved = True + Case "double" : IsReserved = True + Case "each" : IsReserved = True + Case "else" : IsReserved = True + Case "elseif" : IsReserved = True + Case "empty" : IsReserved = True + Case "end" : IsReserved = True + Case "endif" : IsReserved = True + Case "enum" : IsReserved = True + Case "eqv" : IsReserved = True + Case "event" : IsReserved = True + Case "exit" : IsReserved = True + Case "false" : IsReserved = True + Case "for" : IsReserved = True + Case "function" : IsReserved = True + Case "get" : IsReserved = True + Case "goto" : IsReserved = True + Case "if" : IsReserved = True + Case "imp" : IsReserved = True + Case "implements" : IsReserved = True + Case "in" : IsReserved = True + Case "integer" : IsReserved = True + Case "is" : IsReserved = True + Case "let" : IsReserved = True + Case "like" : IsReserved = True + Case "long" : IsReserved = True + Case "loop" : IsReserved = True + Case "lset" : IsReserved = True + Case "me" : IsReserved = True + Case "mod" : IsReserved = True + Case "new" : IsReserved = True + Case "next" : IsReserved = True + Case "not" : IsReserved = True + Case "nothing" : IsReserved = True + Case "null" : IsReserved = True + Case "on" : IsReserved = True + Case "option" : IsReserved = True + Case "optional" : IsReserved = True + Case "or" : IsReserved = True + Case "paramarray" : IsReserved = True + Case "preserve" : IsReserved = True + Case "private" : IsReserved = True + Case "public" : IsReserved = True + Case "raiseevent" : IsReserved = True + Case "redim" : IsReserved = True + Case "rem" : IsReserved = True + Case "resume" : IsReserved = True + Case "rset" : IsReserved = True + Case "select" : IsReserved = True + Case "set" : IsReserved = True + Case "shared" : IsReserved = True + Case "single" : IsReserved = True + Case "static" : IsReserved = True + Case "stop" : IsReserved = True + Case "sub" : IsReserved = True + Case "then" : IsReserved = True + Case "to" : IsReserved = True + Case "true" : IsReserved = True + Case "type" : IsReserved = True + Case "typeof" : IsReserved = True + Case "until" : IsReserved = True + Case "variant" : IsReserved = True + Case "wend" : IsReserved = True + Case "while" : IsReserved = True + Case "with" : IsReserved = True + Case "xor" : IsReserved = True + End Select + Return IsReserved + End Function - End If + ''' + ''' Returns Propercase Sentence + ''' + ''' String to be formatted + ''' + + Public Function ProperCase(ByRef TheString As String) As String + ProperCase = UCase(Left(TheString, 1)) - 'retuning the word - porterAlgorithmStep1 = str + For i = 2 To Len(TheString) + ProperCase = If(Mid(TheString, i - 1, 1) = " ", ProperCase & UCase(Mid(TheString, i, 1)), ProperCase & LCase(Mid(TheString, i, 1))) + Next i End Function - Private Shared Function porterAlgorithmStep2(str As String) As String + + Public Function RemoveBrackets(ByRef Txt As String) As String + 'Brackets + Txt = Txt.Replace("(", "") + Txt = Txt.Replace("{", "") + Txt = Txt.Replace("}", "") + Txt = Txt.Replace("[", "") + Txt = Txt.Replace("]", "") + Return Txt + End Function - On Error Resume Next + + Public Function RemoveFullStop(ByRef MESSAGE As String) As String +Loop1: + If Right(MESSAGE, 1) = "." Then MESSAGE = Left(MESSAGE, Len(MESSAGE) - 1) : GoTo Loop1 + Return MESSAGE + End Function - 'STEP 2 - ' - ' (m>0) ATIONAL -> ATE relational -> relate - ' (m>0) TIONAL -> TION conditional -> condition - ' rational -> rational - ' (m>0) ENCI -> ENCE valenci -> valence - ' (m>0) ANCI -> ANCE hesitanci -> hesitance - ' (m>0) IZER -> IZE digitizer -> digitize - 'Also, - ' (m>0) BLI -> BLE conformabli -> conformable - ' - ' (m>0) ALLI -> AL radicalli -> radical - ' (m>0) ENTLI -> ENT differentli -> different - ' (m>0) ELI -> E vileli - > vile - ' (m>0) OUSLI -> OUS analogousli -> analogous - ' (m>0) IZATION -> IZE vietnamization -> vietnamize - ' (m>0) ATION -> ATE predication -> predicate - ' (m>0) ATOR -> ATE operator -> operate - ' (m>0) ALISM -> AL feudalism -> feudal - ' (m>0) IVENESS -> IVE decisiveness -> decisive - ' (m>0) FULNESS -> FUL hopefulness -> hopeful - ' (m>0) OUSNESS -> OUS callousness -> callous - ' (m>0) ALITI -> AL formaliti -> formal - ' (m>0) IVITI -> IVE sensitiviti -> sensitive - ' (m>0) BILITI -> BLE sensibiliti -> sensible - 'Also, - ' (m>0) LOGI -> LOG apologi -> apolog - ' - 'The test for the string S1 can be made fast by doing a program switch on - 'the penultimate letter of the word being tested. This gives a fairly even - 'breakdown of the possible values of the string S1. It will be seen in fact - 'that the S1-strings in step 2 are presented here in the alphabetical order - 'of their penultimate letter. Similar techniques may be applied in the other - 'steps. + ''' + ''' A string extension method that removes the letter described by @this. + ''' + ''' The @this to act on. + ''' A string. + + Public Function RemoveLetter(this As String) As String + Return New String(this.ToCharArray().Where(Function(x) Not [Char].IsLetter(x)).ToArray()) + End Function - 'declaring local variables - Dim step2(20, 1) As String - Dim i As Byte - Dim temp As String + + Public Function RemoveMathsSymbols(ByRef Txt As String) As String + 'Maths Symbols + Txt = Txt.Replace("+", "") + Txt = Txt.Replace("=", "") + Txt = Txt.Replace("-", "") + Txt = Txt.Replace("/", "") + Txt = Txt.Replace("*", "") + Txt = Txt.Replace("<", "") + Txt = Txt.Replace(">", "") + Txt = Txt.Replace("%", "") + Return Txt + End Function - 'initializing contents of 2D array - step2(0, 0) = "ational" - step2(0, 1) = "ate" - step2(1, 0) = "tional" - step2(1, 1) = "tion" - step2(2, 0) = "enci" - step2(2, 1) = "ence" - step2(3, 0) = "anci" - step2(3, 1) = "ance" - step2(4, 0) = "izer" - step2(4, 1) = "ize" - step2(5, 0) = "bli" - step2(5, 1) = "ble" - step2(6, 0) = "alli" - step2(6, 1) = "al" - step2(7, 0) = "entli" - step2(7, 1) = "ent" - step2(8, 0) = "eli" - step2(8, 1) = "e" - step2(9, 0) = "ousli" - step2(9, 1) = "ous" - step2(10, 0) = "ization" - step2(10, 1) = "ize" - step2(11, 0) = "ation" - step2(11, 1) = "ate" - step2(12, 0) = "ator" - step2(12, 1) = "ate" - step2(13, 0) = "alism" - step2(13, 1) = "al" - step2(14, 0) = "iveness" - step2(14, 1) = "ive" - step2(15, 0) = "fulness" - step2(15, 1) = "ful" - step2(16, 0) = "ousness" - step2(16, 1) = "ous" - step2(17, 0) = "aliti" - step2(17, 1) = "al" - step2(18, 0) = "iviti" - step2(18, 1) = "ive" - step2(19, 0) = "biliti" - step2(19, 1) = "ble" - step2(20, 0) = "logi" - step2(20, 1) = "log" + ''' + ''' A string extension method that removes the number described by @this. + ''' + ''' The @this to act on. + ''' A string. + + Public Function RemoveNumber(this As String) As String + Return New String(this.ToCharArray().Where(Function(x) Not [Char].IsNumber(x)).ToArray()) + End Function - 'checking word - For i = 0 To 20 Step 1 - If porterEndsWith(str, step2(i, 0)) Then - temp = porterTrimEnd(str, Len(step2(i, 0))) - If porterCountm(temp) > 0 Then - str = porterTrimEnd(str, Len(step2(i, 0))) - str = porterAppendEnd(str, step2(i, 1)) - End If - Exit For - End If - Next i + + Public Function RemovePunctuation(ByRef Txt As String) As String + 'Punctuation + Txt = Txt.Replace(",", "") + Txt = Txt.Replace(".", "") + Txt = Txt.Replace(";", "") + Txt = Txt.Replace("'", "") + Txt = Txt.Replace("_", "") + Txt = Txt.Replace("?", "") + Txt = Txt.Replace("!", "") + Txt = Txt.Replace("&", "") + Txt = Txt.Replace(":", "") + + Return Txt + End Function - 'retuning the word - porterAlgorithmStep2 = str + ''' + ''' Removes StopWords from sentence + ''' ARAB/ENG/DUTCH/FRENCH/SPANISH/ITALIAN + ''' Hopefully leaving just relevant words in the user sentence + ''' Currently Under Revision (takes too many words) + ''' + ''' + ''' + + Public Function RemoveStopWords(ByRef Userinput As String) As String + ' Userinput = LCase(Userinput).Replace("the", "r") + For Each item In StopWordsENG + Userinput = LCase(Userinput).Replace(item, "") + Next + For Each item In StopWordsArab + Userinput = Userinput.Replace(item, "") + Next + For Each item In StopWordsDutch + Userinput = Userinput.Replace(item, "") + Next + For Each item In StopWordsFrench + Userinput = Userinput.Replace(item, "") + Next + For Each item In StopWordsItalian + Userinput = Userinput.Replace(item, "") + Next + For Each item In StopWordsSpanish + Userinput = Userinput.Replace(item, "") + Next + Return Userinput + End Function + + Public Function RemoveStopWords(ByRef txt As String, ByRef StopWrds As List(Of String)) As String + For Each item In StopWrds + txt = txt.Replace(item, "") + Next + Return txt End Function - Private Shared Function porterAlgorithmStep3(str As String) As String + + Public Function RemoveSymbols(ByRef Txt As String) As String + 'Basic Symbols + Txt = Txt.Replace("£", "") + Txt = Txt.Replace("$", "") + Txt = Txt.Replace("^", "") + Txt = Txt.Replace("@", "") + Txt = Txt.Replace("#", "") + Txt = Txt.Replace("~", "") + Txt = Txt.Replace("\", "") + Return Txt + End Function - On Error Resume Next + ''' + ''' A string extension method that removes the letter. + ''' + ''' The @this to act on. + ''' The predicate. + ''' A string. + + Public Function RemoveWhere(this As String, predicate As Func(Of Char, Boolean)) As String + Return New String(this.ToCharArray().Where(Function(x) Not predicate(x)).ToArray()) + End Function - 'STEP 3 - ' - ' (m>0) ICATE -> IC triplicate -> triplic - ' (m>0) ATIVE -> formative -> form - ' (m>0) ALIZE -> AL formalize -> formal - ' (m>0) ICITI -> IC electriciti -> electric - ' (m>0) ICAL -> IC electrical -> electric - ' (m>0) FUL -> hopeful -> hope - ' (m>0) NESS -> goodness -> good + ''' + ''' Advanced search String pattern Wildcard denotes which position 1st =1 or 2nd =2 Send + ''' Original input > Search pattern to be used > Wildcard requred SPattern = "WHAT + ''' COLOUR DO YOU LIKE * OR *" Textstr = "WHAT COLOUR DO YOU LIKE red OR black" ITEM_FOUND = + ''' = SearchPattern(USERINPUT, SPattern, 1) ---- RETURNS RED ITEM_FOUND = = + ''' SearchPattern(USERINPUT, SPattern, 1) ---- RETURNS black + ''' + ''' + ''' TextStr Required. String.EG: "WHAT COLOUR DO YOU LIKE red OR black" + ''' + ''' + ''' SPattern Required. String.EG: "WHAT COLOUR DO YOU LIKE * OR *" + ''' + ''' Wildcard Required. Integer.EG: 1st =1 or 2nd =2 + ''' + ''' * in search pattern + + Public Function SearchPattern(ByRef TextSTR As String, ByRef SPattern As String, ByRef Wildcard As Short) As String + Dim SearchP2 As String + Dim SearchP1 As String + Dim TextStrp3 As String + Dim TextStrp4 As String + SearchPattern = "" + SearchP2 = "" + SearchP1 = "" + TextStrp3 = "" + TextStrp4 = "" + If TextSTR Like SPattern = True Then + Select Case Wildcard + Case 1 + Call SplitPhrase(SPattern, "*", SearchP1, SearchP2) + TextSTR = Replace(TextSTR, SearchP1, "", 1, -1, CompareMethod.Text) + + SearchP2 = Replace(SearchP2, "*", "", 1, -1, CompareMethod.Text) + Call SplitPhrase(TextSTR, SearchP2, TextStrp3, TextStrp4) + + TextSTR = TextStrp3 + + Case 2 + Call SplitPhrase(SPattern, "*", SearchP1, SearchP2) + SPattern = Replace(SPattern, SearchP1, " ", 1, -1, CompareMethod.Text) + TextSTR = Replace(TextSTR, SearchP1, " ", 1, -1, CompareMethod.Text) + + Call SplitPhrase(SearchP2, "*", SearchP1, SearchP2) + Call SplitPhrase(TextSTR, SearchP1, TextStrp3, TextStrp4) + + TextSTR = TextStrp4 + + End Select + + SearchPattern = TextSTR + LTrim(SearchPattern) + RTrim(SearchPattern) + Else + End If - 'declaring local variables - Dim i As Byte - Dim temp As String - Dim step3(6, 1) As String + End Function - 'initializing contents of 2D array - step3(0, 0) = "icate" - step3(0, 1) = "ic" - step3(1, 0) = "ative" - step3(1, 1) = "" - step3(2, 0) = "alize" - step3(2, 1) = "al" - step3(3, 0) = "iciti" - step3(3, 1) = "ic" - step3(4, 0) = "ical" - step3(4, 1) = "ic" - step3(5, 0) = "ful" - step3(5, 1) = "" - step3(6, 0) = "ness" - step3(6, 1) = "" + ''' + ''' Advanced search String pattern Wildcard denotes which position 1st =1 or 2nd =2 Send + ''' Original input > Search pattern to be used > Wildcard requred SPattern = "WHAT + ''' COLOUR DO YOU LIKE * OR *" Textstr = "WHAT COLOUR DO YOU LIKE red OR black" ITEM_FOUND = + ''' = SearchPattern(USERINPUT, SPattern, 1) ---- RETURNS RED ITEM_FOUND = = + ''' SearchPattern(USERINPUT, SPattern, 2) ---- RETURNS black + ''' + ''' TextStr = "Pick Red OR Blue" . String. + ''' Search String = ("Pick * OR *") String. + ''' Wildcard Required. Integer. = 1= Red / 2= Blue + ''' + ''' finds the * in search pattern + + Public Function SearchStringbyPattern(ByRef TextSTR As String, ByRef SPattern As String, ByRef Wildcard As Short) As String + Dim SearchP2 As String + Dim SearchP1 As String + Dim TextStrp3 As String + Dim TextStrp4 As String + SearchStringbyPattern = "" + SearchP2 = "" + SearchP1 = "" + TextStrp3 = "" + TextStrp4 = "" + If TextSTR Like SPattern = True Then + Select Case Wildcard + Case 1 + Call SplitString(SPattern, "*", SearchP1, SearchP2) + TextSTR = Replace(TextSTR, SearchP1, "", 1, -1, CompareMethod.Text) + + SearchP2 = Replace(SearchP2, "*", "", 1, -1, CompareMethod.Text) + Call SplitString(TextSTR, SearchP2, TextStrp3, TextStrp4) + + TextSTR = TextStrp3 + + Case 2 + Call SplitString(SPattern, "*", SearchP1, SearchP2) + SPattern = Replace(SPattern, SearchP1, " ", 1, -1, CompareMethod.Text) + TextSTR = Replace(TextSTR, SearchP1, " ", 1, -1, CompareMethod.Text) + + Call SplitString(SearchP2, "*", SearchP1, SearchP2) + Call SplitString(TextSTR, SearchP1, TextStrp3, TextStrp4) + + TextSTR = TextStrp4 + + End Select + + SearchStringbyPattern = TextSTR + LTrim(SearchStringbyPattern) + RTrim(SearchStringbyPattern) + Else + End If - 'checking word - For i = 0 To 6 Step 1 - If porterEndsWith(str, step3(i, 0)) Then - temp = porterTrimEnd(str, Len(step3(i, 0))) - If porterCountm(temp) > 0 Then - str = porterTrimEnd(str, Len(step3(i, 0))) - str = porterAppendEnd(str, step3(i, 1)) - End If - Exit For - End If - Next i + End Function - 'retuning the word - porterAlgorithmStep3 = str + + Public Function SpaceItems(ByRef txt As String, Item As String) As String + Return txt.Replace(Item, " " & Item & " ") + End Function + + Public Function SpacePunctuation(ByRef Txt As String) As String + For Each item In Symbols + Txt = SpaceItems(Txt, item) + Next + For Each item In EncapuslationPunctuationEnd + Txt = SpaceItems(Txt, item) + Next + For Each item In EncapuslationPunctuationStart + Txt = SpaceItems(Txt, item) + Next + For Each item In GramaticalPunctuation + Txt = SpaceItems(Txt, item) + Next + For Each item In MathPunctuation + Txt = SpaceItems(Txt, item) + Next + For Each item In MoneyPunctuation + Txt = SpaceItems(Txt, item) + Next + Return Txt End Function - Private Shared Function porterAlgorithmStep4(str As String) As String + ''' + ''' SPLITS THE GIVEN PHRASE UP INTO TWO PARTS by dividing word SplitPhrase(Userinput, "and", + ''' Firstp, SecondP) + ''' + ''' Sentence to be divided + ''' String: Word to divide sentence by + ''' String: firstpart of sentence to be populated + ''' String: Secondpart of sentence to be populated + ''' + + Public Sub SplitPhrase(ByVal PHRASE As String, ByRef DIVIDINGWORD As String, ByRef FIRSTPART As String, ByRef SECONDPART As String) + Dim POS As Short + POS = InStr(PHRASE, DIVIDINGWORD) + If (POS > 0) Then + FIRSTPART = Trim(Left(PHRASE, POS - 1)) + SECONDPART = Trim(Right(PHRASE, Len(PHRASE) - POS - Len(DIVIDINGWORD) + 1)) + Else + FIRSTPART = "" + SECONDPART = PHRASE + End If + End Sub - On Error Resume Next + ''' + ''' SPLITS THE GIVEN PHRASE UP INTO TWO PARTS by dividing word SplitPhrase(Userinput, "and", + ''' Firstp, SecondP) + ''' + ''' String: Sentence to be divided + ''' String: Word to divide sentence by + ''' String-Returned : firstpart of sentence to be populated + ''' String-Returned : Secondpart of sentence to be populated + ''' + + Public Sub SplitString(ByVal PHRASE As String, ByRef DIVIDINGWORD As String, ByRef FIRSTPART As String, ByRef SECONDPART As String) + Dim POS As Short + 'Check Error + If DIVIDINGWORD IsNot Nothing And PHRASE IsNot Nothing Then + + POS = InStr(PHRASE, DIVIDINGWORD) + If (POS > 0) Then + FIRSTPART = Trim(Left(PHRASE, POS - 1)) + SECONDPART = Trim(Right(PHRASE, Len(PHRASE) - POS - Len(DIVIDINGWORD) + 1)) + Else + FIRSTPART = "" + SECONDPART = PHRASE + End If + Else - 'STEP 4 - ' - ' (m>1) AL -> revival -> reviv - ' (m>1) ANCE -> allowance -> allow - ' (m>1) ENCE -> inference -> infer - ' (m>1) ER -> airliner -> airlin - ' (m>1) IC -> gyroscopic -> gyroscop - ' (m>1) ABLE -> adjustable -> adjust - ' (m>1) IBLE -> defensible -> defens - ' (m>1) ANT -> irritant -> irrit - ' (m>1) EMENT -> replacement -> replac - ' (m>1) MENT -> adjustment -> adjust - ' (m>1) ENT -> dependent -> depend - ' (m>1 and (*S or *T)) ION -> adoption -> adopt - ' (m>1) OU -> homologou -> homolog - ' (m>1) ISM -> communism -> commun - ' (m>1) ATE -> activate -> activ - ' (m>1) ITI -> angulariti -> angular - ' (m>1) OUS -> homologous -> homolog - ' (m>1) IVE -> effective -> effect - ' (m>1) IZE -> bowdlerize -> bowdler - ' - 'The suffixes are now removed. All that remains is a little tidying up. + End If + End Sub - 'declaring local variables - Dim i As Byte - Dim temp As String - Dim step4(18) As String + ''' + ''' Split string to List of strings + ''' + ''' base string + ''' to be seperated by + ''' + + Public Function SplitToList(ByRef Str As String, ByVal Seperator As String) As List(Of String) + Dim lst As New List(Of String) + If Str <> "" = True And Seperator <> "" Then + + Dim Found() As String = Str.Split(Seperator) + For Each item In Found + lst.Add(item) + Next + Else - 'initializing contents of 2D array - step4(0) = "al" - step4(1) = "ance" - step4(2) = "ence" - step4(3) = "er" - step4(4) = "ic" - step4(5) = "able" - step4(6) = "ible" - step4(7) = "ant" - step4(8) = "ement" - step4(9) = "ment" - step4(10) = "ent" - step4(11) = "ion" - step4(12) = "ou" - step4(13) = "ism" - step4(14) = "ate" - step4(15) = "iti" - step4(16) = "ous" - step4(17) = "ive" - step4(18) = "ize" + End If + Return lst + End Function - 'checking word - For i = 0 To 18 Step 1 + ''' + ''' Returns a delimited string from the list. + ''' + ''' + ''' + ''' + + Public Function ToDelimitedString(ls As List(Of String), delimiter As String) As String + Dim sb As New StringBuilder + For Each buf As String In ls + sb.Append(buf) + sb.Append(delimiter) + Next + Return sb.ToString.Trim(CChar(delimiter)) + End Function - If porterEndsWith(str, step4(i)) Then + ''' + ''' Convert object to Json String + ''' + ''' + ''' + + Public Function ToJson(ByRef Item As Object) As String + Dim Converter As New JavaScriptSerializer + Return Converter.Serialize(Item) - temp = porterTrimEnd(str, Len(step4(i))) + End Function - If porterCountm(temp) > 1 Then + ''' + ''' Counts the vowels used (AEIOU) + ''' + ''' + ''' + ''' + + Public Function VowelCount(ByVal InputString As String) As Integer + Dim v(9) As String 'Declare an array of 10 elements 0 to 9 + Dim vcount As Short 'This variable will contain number of vowels + Dim flag As Integer + Dim strLen As Integer + Dim i As Integer + v(0) = "a" 'First element of array is assigned small a + v(1) = "i" + v(2) = "o" + v(3) = "u" + v(4) = "e" + v(5) = "A" 'Sixth element is assigned Capital A + v(6) = "I" + v(7) = "O" + v(8) = "U" + v(9) = "E" + strLen = Len(InputString) + + For flag = 1 To strLen 'It will get every letter of entered string and loop + 'will terminate when all letters have been examined + + For i = 0 To 9 'Takes every element of v(9) one by one + 'Check if current letter is a vowel + If Mid(InputString, flag, 1) = v(i) Then + vcount = vcount + 1 ' If letter is equal to vowel + 'then increment vcount by 1 + End If + Next i 'Consider next value of v(i) + Next flag 'Consider next letter of the enterd string - If porterEndsWith(str, "ion") Then - If porterEndsWith(temp, "s") Or porterEndsWith(temp, "t") Then - str = porterTrimEnd(str, Len(step4(i))) - str = porterAppendEnd(str, "") - End If - Else - str = porterTrimEnd(str, Len(step4(i))) - str = porterAppendEnd(str, "") - End If + VowelCount = vcount - End If + End Function - Exit For + End Module + + Public Module TextProcessingTasks + + + Public Function PerformTasks(ByRef Txt As String, ByRef Tasks As List(Of TextPreProcessingTasks)) As String + + For Each tsk In Tasks + Select Case tsk + + Case TextPreProcessingTasks.Space_Punctuation + + Txt = SpacePunctuation(Txt).Replace(" ", " ") + Case TextPreProcessingTasks.To_Upper + Txt = Txt.ToUpper.Replace(" ", " ") + Case TextPreProcessingTasks.To_Lower + Txt = Txt.ToLower.Replace(" ", " ") + Case TextPreProcessingTasks.Lemmatize_Text + Case TextPreProcessingTasks.Remove_Stop_Words + TextExtensions.RemoveStopWords(Txt) + Case TextPreProcessingTasks.Remove_Symbols + Txt = RemoveSymbols(Txt).Replace(" ", " ") + Case TextPreProcessingTasks.Remove_Brackets + Txt = RemoveBrackets(Txt).Replace(" ", " ") + Case TextPreProcessingTasks.Remove_Maths_Symbols + Txt = RemoveMathsSymbols(Txt).Replace(" ", " ") + Case TextPreProcessingTasks.Remove_Punctuation + Txt = RemovePunctuation(Txt).Replace(" ", " ") + Case TextPreProcessingTasks.AlphaNumeric_Only + Txt = AlphaNumericOnly(Txt).Replace(" ", " ") + End Select + Next - End If + Return Txt + End Function - Next i + Public Enum TextPreProcessingTasks + Space_Punctuation + To_Upper + To_Lower + Lemmatize_Text + Remove_Stop_Words + Remove_Symbols + Remove_Brackets + Remove_Maths_Symbols + Remove_Punctuation + AlphaNumeric_Only + End Enum + + End Module + + + Public Class SentenceSplitter + Public Const ClassId As String = "28993390-7702-401C-BAB3-38FF97BC1AC9" + Public Const EventsId As String = "CD334307-F53E-401A-AC6D-3CFDD86FD6F1" + Public Const InterfaceId As String = "8B3345B1-5D13-4059-829B-B531310144B5" - 'retuning the word - porterAlgorithmStep4 = str + ''' + ''' punctuation markers for end of sentences(individual thoughts) Set in order of Rank + ''' + Public Shared EndPunctuation() As String = {".", ";", "?", "!", ":"} - End Function + ''' + ''' Punctuation(known) + ''' + Public Shared Punctuation() As String = {".", ",", ";", "?", "!", ":", "$", "%", "^", "*", "<", ">", +"/", "@", "(", ")", "'""{", "}", "[", "]", "\", "|", "+", "=", "_", "-"} - Private Shared Function porterAlgorithmStep5(str As String) As String + Private mSent As List(Of String) - On Error Resume Next + ''' + ''' Provide text for sentence definition, + ''' + ''' + Public Sub New(ByVal Text As String) + mSent = SplitTextToSentences(Text) + End Sub - 'STEP 5a - ' - ' (m>1) E -> probate -> probat - ' rate -> rate - ' (m=1 and not *o) E -> cease -> ceas - ' - 'STEP 5b - ' - ' (m>1 and *d and *L) -> single letter - ' controll -> control - ' roll -> roll + ''' + ''' Returns number of sentences found + ''' + ''' + Public ReadOnly Property Count As Integer + Get + For Each Sent As String In Sentences + Count += 1 - 'declaring local variables - Dim i As Byte - Dim temp As String + Next + Return Count + End Get + End Property - 'Step5a - If porterEndsWith(str, "e") Then 'word ends with e - temp = porterTrimEnd(str, 1) - If porterCountm(temp) > 1 Then 'm>1 - str = porterTrimEnd(str, 1) - ElseIf porterCountm(temp) = 1 Then 'm=1 - If Not porterEndsCVC(temp) Then 'not *o - str = porterTrimEnd(str, 1) - End If - End If - End If + Public ReadOnly Property Sentences As List(Of String) + Get + Return mSent + End Get + End Property - '-------------------------------------------------------------------------------------------------------- - ' - 'Step5b - If porterCountm(str) > 1 Then - If porterEndsDoubleConsonent(str) And porterEndsWith(str, "l") Then - str = porterTrimEnd(str, 1) - End If - End If + ''' + ''' Removes Trailing Spaces as well as double spaces from Text Also the Text is Capitalized + ''' + ''' + ''' + Public Shared Function FormatText(ByRef Text As String) As String + Dim FormatTextResponse As String = "" + 'FORMAT USERINPUT + 'turn to uppercase for searching the db + Text = LTrim(Text) + Text = RTrim(Text) + Text = Text.Replace(" ", " ") + FormatTextResponse = Text + Return FormatTextResponse + End Function - 'retuning the word - porterAlgorithmStep5 = str + ''' + ''' finds sentences in text or phrase. based on EndPunctuation markers + ''' + ''' + ''' Returns a list of sentences defined in the text + Public Shared Function GetSentences(ByRef InputStr As String) As List(Of String) + GetSentences = New List(Of String) + Dim s As New SentenceSplitter(InputStr) + For Each Sent As String In s.Sentences + GetSentences.Add(Sent) + Next + End Function + ''' + ''' Removes Punctuation from Text + ''' + ''' + ''' Cleaned Text + Public Shared Function RemovePunctuation(ByVal Text As String) As String + Dim mText As String = Text + For Each item As String In Punctuation + mText = mText.Replace(item, " ") + Next + mText = mText.Replace(" ", " ") + Return mText End Function - Private Shared Function porterAppendEnd(str As String, ends As String) As String + ''' + ''' Splits Sentences by the Punctution offered. As it may be prudent to split by "." then + ''' after by "," for sub components of the sentence etc + ''' + ''' text to be examined + ''' Punctuation to be used as end marker + ''' + Public Shared Function SplitTextToSentences(ByVal mText As String, ByVal mEndPunctuation As String) As List(Of String) - On Error Resume Next + Dim Text As String = mText - 'returning the appended string - porterAppendEnd = str + ends + Text = Text.Replace(mEndPunctuation, "#") + + Dim TempSentencesArray() As String = Split(Text, "#") + Dim mSentences As New List(Of String) + For Each SentStr As String In TempSentencesArray + If SentStr <> "" Then + mSentences.Add(FormatText(SentStr)) + End If + Next + + Return mSentences End Function - Private Shared Function porterContains(str As String, present As String) As Boolean + ''' + ''' Splits to sentences based on all end markers in EndPunctuation + ''' + ''' + ''' + Private Function SplitTextToSentences(ByVal mText As String) As List(Of String) - On Error Resume Next + Dim Text As String = mText + For Each item As String In EndPunctuation + Text = Text.Replace(item, "#") - 'checking whether strr contains present - porterContains = If(InStr(str, present) = 0, False, True) + Next + Dim TempSentencesArray() As String = Split(Text, "#") + Dim mSentences As New List(Of String) + For Each SentStr As String In TempSentencesArray + If SentStr <> "" Then + mSentences.Add(FormatText(SentStr)) + End If + + Next + Return mSentences End Function - Private Shared Function porterContainsVowel(str As String) As Boolean + End Class - 'checking word to see if vowels are present - Dim pattern As String + Public Class Summarise - If Len(str) >= 0 Then + Public Function GenerateSummary(ByRef Text As String, ByRef Entitys As List(Of String)) As String + ' Step 5: Generate the summary + Return String.Join(vbNewLine, ExtractImportantSentencesInText(Text, Entitys, True, 2)) + End Function - 'find out the CVC pattern - pattern = returnCVCpattern(str) + Public Function GenerateSummary(ByVal text As String, ByVal entities As List(Of String), ByVal numContextSentencesBefore As Integer, ByVal numContextSentencesAfter As Integer) As String + ' Extract important sentences with context + Dim importantSentences As List(Of String) = ExtractImportantSentencesInText(text, entities, numContextSentencesBefore, numContextSentencesAfter) - 'check to see if the return pattern contains a vowel - porterContainsVowel = If(InStr(pattern, "v") = 0, False, True) - Else - porterContainsVowel = False - End If + ' Generate the summary + Dim summary As String = String.Join(". ", importantSentences) + Return summary End Function - Private Shared Function porterCountm(str As String) As Byte + ''' + ''' Searches for important sentences in text , identified by the presence of an entity from this list + ''' These lists can be specific to a particular topic or entity or a search query + ''' + ''' + ''' Entity list + ''' + ''' + ''' + Public Function ExtractImportantSentencesInText(ByRef Text As String, + EntityList As List(Of String), + Optional WithContext As Boolean = False, + Optional NumberOfContextSentences As Integer = 0) As List(Of String) + Dim Sents As New List(Of String) - On Error Resume Next + Select Case WithContext + Case False - 'A \consonant\ in a word is a letter other than A, E, I, O or U, and other - 'than Y preceded by a consonant. (The fact that the term `consonant' is - 'defined to some extent in terms of itself does not make it ambiguous.) So in - 'TOY the consonants are T and Y, and in SYZYGY they are S, Z and G. If a - 'letter is not a consonant it is a \vowel\. + For Each Sent In Split(Text, ".") + For Each Entity In EntityList + If Sent.Contains(Entity) Then + Sents.Add(Sent) + End If + Next - 'declaring local variables - Dim chars() As Byte - Dim const_vowel As String - Dim i As Byte - Dim m As Byte - Dim flag As Boolean - Dim pattern As String + Next + Return Sents.Distinct.ToList + Case True - 'initializing - const_vowel = "" - m = 0 - flag = False + For Each Sent In Split(Text, ".") + For Each Entity In EntityList + If Sent.ToLower.Contains(Entity.ToLower) Then + Sents.AddRange(ExtractContextSentences(Text, Sent, NumberOfContextSentences)) + End If + Next - If Not Len(str) = 0 Then + Next + Return Sents.Distinct.ToList + End Select - 'find out the CVC pattern - pattern = returnCVCpattern(str) + Return Sents.Distinct.ToList + End Function - 'converting const_vowel to byte array - chars = System.Text.Encoding.Unicode.GetBytes(pattern) + ''' + ''' grabs important sentences from text based on the entity list provided . + ''' (values or terms or noun phrases or verb phrases) as this is a sentence level search + ''' it also grabs the context sentences surrounding it based on the inputs + ''' + ''' + ''' + ''' + ''' + ''' + Public Function ExtractImportantSentencesInText(ByVal text As String, ByVal entityList As List(Of String), ByVal numContextSentencesBefore As Integer, ByVal numContextSentencesAfter As Integer) As List(Of String) + Dim importantSentences As New List(Of String) - 'counting the number of m's... - For i = 0 To UBound(chars) Step 1 - If Chr(chars(i)) = "v" Or flag = True Then - flag = True - If Chr(chars(i)) = "c" Then - m = m + 1 - flag = False - End If + For Each sentence In text.Split("."c) + For Each entity In entityList + If sentence.ToLower.Contains(entity.ToLower) Then + ' Add the current sentence and the context sentences + importantSentences.AddRange(ExtractContextSentences(text, sentence, numContextSentencesBefore, numContextSentencesAfter)) + Exit For ' Break out of the inner loop if the entity is found in the sentence End If - Next i - - End If - - porterCountm = m + Next + Next + Return importantSentences.Distinct().ToList() End Function - Private Shared Function porterEndsCVC(str As String) As Boolean + ''' + ''' Gets important Sentences in text with or without context + ''' + ''' + ''' + ''' + ''' + ''' + ''' + Public Function ExtractImportantSentencesInText(ByRef Text As String, EntityList As List(Of String), Optional WithContext As Boolean = False, + Optional NumberOfContextSentencesBefore As Integer = 0, + Optional NumberOfContextSentencesAfter As Integer = 0) As List(Of String) + Dim importantSentences As New List(Of String) - On Error Resume Next + For Each sentence In Split(Text, ".") + For Each entity In EntityList + If sentence.ToLower.Contains(entity.ToLower) Then + importantSentences.Add(sentence) + Exit For ' Break out of the inner loop if the entity is found in the sentence + End If + Next + Next - '*o - the stem ends cvc, where the second c is not W, X or Y (e.g. -WIL, -HOP). + If WithContext Then + Dim sentencesWithContext As New List(Of String) + For Each sentence In importantSentences + sentencesWithContext.AddRange(ExtractContextSentences(Text, sentence, NumberOfContextSentencesBefore, NumberOfContextSentencesAfter)) + Next + Return sentencesWithContext + Else + Return importantSentences + End If + End Function - 'declaring local variables - Dim chars() As Byte - Dim const_vowel As String - Dim i As Byte - Dim pattern As String + ''' + ''' Given an important Sentence Extract its surrounding context Sentences + ''' + ''' + ''' Important Sentence to match + ''' Number of Sentences Either Side + ''' + Public Function ExtractContextSentences(ByRef Text As String, ByRef ImportantSentence As String, ByRef ConTextInt As Integer) As List(Of String) + Dim ContextSentences As New List(Of String) + Dim CurrentSentences As New List(Of String) + Dim Count As Integer = 0 - 'check to see if atleast 3 characters are present - If Len(str) >= 3 Then + For Each Sent In Split(Text, ".") + CurrentSentences.Add(Sent) + Count += 1 + If Sent = ImportantSentence Then + 'Get Previous sentences - 'converting string to byte array + For i = 0 To ConTextInt + Dim Index = Count - 1 + If Index >= 0 Or Index < CurrentSentences.Count Then - chars = System.Text.Encoding.Unicode.GetBytes(str) + ContextSentences.Add(CurrentSentences(Index)) - 'find out the CVC pattern - pattern = returnCVCpattern(str) + End If + Next + ContextSentences.Add(ImportantSentence) + 'GetFollowing Sentences + For i = 0 To ConTextInt + If Count + i < CurrentSentences.Count Then + ContextSentences.Add(CurrentSentences(Count + i)) + End If + Next + End If + Next + Return ContextSentences + End Function - 'we need to check only the last three characters - pattern = Right(pattern, 3) + ''' + ''' Given an important Sentence Extract its surrounding context Sentences - + ''' In some cases it may be prudent to grab only a single sentence before and multiple sentences after + ''' important to know which context is important in which instance + ''' + ''' Document + ''' Sentence to be matched + ''' number of + ''' number of + ''' + Public Function ExtractContextSentences(ByVal text As String, ByVal importantSentence As String, ByVal numContextSentencesBefore As Integer, ByVal numContextSentencesAfter As Integer) As List(Of String) + Dim contextSentences As New List(Of String) + Dim allSentences As List(Of String) = text.Split("."c).ToList() + Dim sentenceIndex As Integer = allSentences.IndexOf(importantSentence) - 'check to see if the letters in str match the sequence cvc - porterEndsCVC = If(pattern = "cvc", If(Not (Chr(chars(UBound(chars))) = "w" Or Chr(chars(UBound(chars))) = "x" Or Chr(chars(UBound(chars))) = "y"), True, False), False) - Else + ' Get sentences before the important sentence + Dim startIndex As Integer = Math.Max(0, sentenceIndex - numContextSentencesBefore) + For i = startIndex To sentenceIndex - 1 + contextSentences.Add(allSentences(i)) + Next - porterEndsCVC = False + ' Add the important sentence + contextSentences.Add(importantSentence) - End If + ' Get sentences after the important sentence + Dim endIndex As Integer = Math.Min(sentenceIndex + numContextSentencesAfter, allSentences.Count - 1) + For i = sentenceIndex + 1 To endIndex + contextSentences.Add(allSentences(i)) + Next + Return contextSentences End Function - Private Shared Function porterEndsDoubleConsonent(str As String) As Boolean + Public Function GenerateTextFromEntities(entities As List(Of String), storedSentences As List(Of String)) As String + ' Implement your custom text generation logic here + ' Generate text using the entities and stored sentences - On Error Resume Next + Dim generatedText As String = "" - 'checking whether word ends with a double consonant (e.g. -TT, -SS). + ' Example text generation logic + For Each entity As String In entities + Dim matchingSentences As List(Of String) = FindSentencesWithEntity(entity, storedSentences) - 'declaring local variables - Dim holds_ends As String - Dim hold_third_last As String - Dim chars() As Byte + ' Randomly select a sentence from the matching sentences + Dim random As New Random() + Dim selectedSentence As String = matchingSentences(random.Next(0, matchingSentences.Count)) - 'first check whether the size of the word is >= 2 - If Len(str) >= 2 Then + ' Replace the entity tag with the actual entity in the selected sentence + Dim generatedSentence As String = selectedSentence.Replace(entity, "<<" & entity & ">>") - 'extract 2 characters from right of str - holds_ends = Right(str, 2) + ' Append the generated sentence to the generated text + generatedText &= generatedSentence & " " + Next - 'converting string to byte array - chars = System.Text.Encoding.Unicode.GetBytes(holds_ends) + Return generatedText.Trim() + End Function - 'checking if both the characters are same - If chars(0) = chars(1) Then + Public Function FindSentencesWithEntity(entity As String, storedSentences As List(Of String)) As List(Of String) + ' Implement your custom logic to find sentences that contain the given entity + ' Return a list of sentences that match the entity - 'check for double consonent - If holds_ends = "aa" Or holds_ends = "ee" Or holds_ends = "ii" Or holds_ends = "oo" Or holds_ends = "uu" Then + Dim matchingSentences As New List(Of String) - porterEndsDoubleConsonent = False - Else + ' Example logic: Check if the entity appears in each stored sentence + For Each sentence As String In storedSentences + If sentence.Contains(entity) Then + matchingSentences.Add(sentence) + End If + Next - 'if the second last character is y, and there are atleast three letters in str - If holds_ends = "yy" And Len(str) > 2 Then + Return matchingSentences + End Function - 'extracting the third last character - hold_third_last = Right(str, 3) - hold_third_last = Left(str, 1) + End Class + ''' + ''' TO USE THE PROGRAM CALL THE FUNCTION PORTERALGORITHM. THE WORD + ''' TO BE STEMMED SHOULD BE PASSED AS THE ARGUEMENT ARGUEMENT. THE STRING + ''' RETURNED BY THE FUNCTION IS THE STEMMED WORD + ''' Porter Stemmer. It follows the algorithm definition + ''' presented in : + ''' Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, + ''' no. 3, pp 130-137, + ''' + Public Class WordStemmer - porterEndsDoubleConsonent = If(Not (hold_third_last = "a" Or hold_third_last = "e" Or hold_third_last = "i" Or hold_third_last = "o" Or hold_third_last = "u"), False, True) - Else + ' (http://www.tartarus.org/~martin/PorterStemmer) - porterEndsDoubleConsonent = True + 'Author : Navonil Mustafee + 'Brunel University - student + 'Algorithm Implemented as part for assignment on document visualization - End If + Public Shared Function StemWord(str As String) As String - End If - Else + 'only strings greater than 2 are stemmed + If Len(Trim(str)) > 0 Then + str = porterAlgorithmStep1(str) + str = porterAlgorithmStep2(str) + str = porterAlgorithmStep3(str) + str = porterAlgorithmStep4(str) + str = porterAlgorithmStep5(str) + End If - porterEndsDoubleConsonent = False + 'End of Porter's algorithm.........returning the word + StemWord = str - End If - Else + End Function - porterEndsDoubleConsonent = False + Private Shared Function porterAlgorithmStep1(str As String) As String - End If + On Error Resume Next - End Function + 'STEP 1A + ' + ' SSES -> SS caresses -> caress + ' IES -> I ponies -> poni + ' ties -> ti + ' SS -> SS caress -> caress + ' S -> cats -> cat - Private Shared Function porterEndsWith(str As String, ends As String) As Boolean + 'declaring local variables + Dim i As Byte + Dim j As Byte + Dim step1a(3, 1) As String - On Error Resume Next + 'initializing contents of 2D array + step1a(0, 0) = "sses" + step1a(0, 1) = "ss" + step1a(1, 0) = "ies" + step1a(1, 1) = "i" + step1a(2, 0) = "ss" + step1a(2, 1) = "ss" + step1a(3, 0) = "s" + step1a(3, 1) = "" + + 'checking word + For i = 0 To 3 Step 1 + If porterEndsWith(str, step1a(i, 0)) Then + str = porterTrimEnd(str, Len(step1a(i, 0))) + str = porterAppendEnd(str, step1a(i, 1)) + Exit For + End If + Next i + + '-------------------------------------------------------------------------------------------------------- + + 'STEP 1B + ' + ' If + ' (m>0) EED -> EE feed -> feed + ' agreed -> agree + ' Else + ' (*v*) ED -> plastered -> plaster + ' bled -> bled + ' (*v*) ING -> motoring -> motor + ' sing -> sing + ' + 'If the second or third of the rules in Step 1b is successful, the following + 'is done: + ' + ' AT -> ATE conflat(ed) -> conflate + ' BL -> BLE troubl(ed) -> trouble + ' IZ -> IZE siz(ed) -> size + ' (*d and not (*L or *S or *Z)) + ' -> single letter + ' hopp(ing) -> hop + ' tann(ed) -> tan + ' fall(ing) -> fall + ' hiss(ing) -> hiss + ' fizz(ed) -> fizz + ' (m=1 and *o) -> E fail(ing) -> fail + ' fil(ing) -> file + ' + 'The rule to map to a single letter causes the removal of one of the double + 'letter pair. The -E is put back on -AT, -BL and -IZ, so that the suffixes + '-ATE, -BLE and -IZE can be recognised later. This E may be removed in step + '4. 'declaring local variables - Dim length_str As Byte - Dim length_ends As Byte - Dim hold_ends As String + Dim m As Byte + Dim temp As String + Dim second_third_success As Boolean - 'finding the length of the string - length_str = Len(str) - length_ends = Len(ends) + 'initializing contents of 2D array + second_third_success = False - 'if length of str is greater than the length of length_ends, only then proceed..else return false - If length_ends >= length_str Then + '(m>0) EED -> EE..else..(*v*) ED ->(*v*) ING -> + If porterEndsWith(str, "eed") Then - porterEndsWith = False - Else + 'counting the number of m's + temp = porterTrimEnd(str, Len("eed")) + m = porterCountm(temp) - 'extract characters from right of str - hold_ends = Right(str, length_ends) + If m > 0 Then + str = porterTrimEnd(str, Len("eed")) + str = porterAppendEnd(str, "ee") + End If - 'comparing to see whether hold_ends=ends - porterEndsWith = If(StrComp(hold_ends, ends) = 0, True, False) + ElseIf porterEndsWith(str, "ed") Then - End If + 'trim and check for vowel + temp = porterTrimEnd(str, Len("ed")) - End Function + If porterContainsVowel(temp) Then + str = porterTrimEnd(str, Len("ed")) + second_third_success = True + End If - Private Shared Function porterTrimEnd(str As String, length As Byte) As String + ElseIf porterEndsWith(str, "ing") Then - On Error Resume Next + 'trim and check for vowel + temp = porterTrimEnd(str, Len("ing")) - 'returning the trimmed string - porterTrimEnd = Left(str, Len(str) - length) + If porterContainsVowel(temp) Then + str = porterTrimEnd(str, Len("ing")) + second_third_success = True + End If - End Function + End If - Private Shared Function returnCVCpattern(str As String) As String + 'If the second or third of the rules in Step 1b is SUCCESSFUL, the following + 'is done: + ' + ' AT -> ATE conflat(ed) -> conflate + ' BL -> BLE troubl(ed) -> trouble + ' IZ -> IZE siz(ed) -> size + ' (*d and not (*L or *S or *Z)) + ' -> single letter + ' hopp(ing) -> hop + ' tann(ed) -> tan + ' fall(ing) -> fall + ' hiss(ing) -> hiss + ' fizz(ed) -> fizz + ' (m=1 and *o) -> E fail(ing) -> fail + ' fil(ing) -> file - 'local variables - Dim chars() As Byte - Dim const_vowel As String = "" - Dim i As Byte + If second_third_success = True Then 'If the second or third of the rules in Step 1b is SUCCESSFUL - 'converting string to byte array - chars = System.Text.Encoding.Unicode.GetBytes(str) + If porterEndsWith(str, "at") Then 'AT -> ATE + str = porterTrimEnd(str, Len("at")) + str = porterAppendEnd(str, "ate") + ElseIf porterEndsWith(str, "bl") Then 'BL -> BLE + str = porterTrimEnd(str, Len("bl")) + str = porterAppendEnd(str, "ble") + ElseIf porterEndsWith(str, "iz") Then 'IZ -> IZE + str = porterTrimEnd(str, Len("iz")) + str = porterAppendEnd(str, "ize") + ElseIf porterEndsDoubleConsonent(str) Then '(*d and not (*L or *S or *Z))-> single letter + If Not (porterEndsWith(str, "l") Or porterEndsWith(str, "s") Or porterEndsWith(str, "z")) Then + str = porterTrimEnd(str, 1) + End If + ElseIf porterCountm(str) = 1 Then '(m=1 and *o) -> E + If porterEndsCVC(str) Then + str = porterAppendEnd(str, "e") + End If + End If - 'checking each character to see if it is a consonent or a vowel. also inputs the information in const_vowel - For i = 0 To UBound(chars) Step 1 + End If - If Chr(chars(i)) = "a" Or Chr(chars(i)) = "e" Or Chr(chars(i)) = "i" Or Chr(chars(i)) = "o" Or Chr(chars(i)) = "u" Then - const_vowel = const_vowel + "v" - ElseIf Chr(chars(i)) = "y" Then - 'if y is not the first character, only then check the previous character - 'check to see if previous character is a consonent - const_vowel = If(i > 0, If(Not (Chr(chars(i - 1)) = "a" Or Chr(chars(i - 1)) = "e" Or Chr(chars(i - 1)) = "i" Or Chr(chars(i - 1)) = "o" Or Chr(chars(i - 1)) = "u"), const_vowel + "v", const_vowel + "c"), const_vowel + "c") - Else - const_vowel = const_vowel + "c" + '-------------------------------------------------------------------------------------------------------- + ' + 'STEP 1C + ' + ' (*v*) Y -> I happy -> happi + ' sky -> sky + + If porterEndsWith(str, "y") Then + + 'trim and check for vowel + temp = porterTrimEnd(str, 1) + + If porterContainsVowel(temp) Then + str = porterTrimEnd(str, Len("y")) + str = porterAppendEnd(str, "i") End If - Next i + End If - returnCVCpattern = const_vowel + 'retuning the word + porterAlgorithmStep1 = str End Function - End Class - ''' - ''' The removal of commonly used words which are only used to create a sentence such as, - ''' the, on, in of, but - ''' - Public Class RemoveStopWords + Private Shared Function porterAlgorithmStep2(str As String) As String - Public StopWords As New List(Of String) + On Error Resume Next - Public StopWordsArab() As String = {"،", "آض", "آمينَ", "آه", - "آهاً", "آي", "أ", "أب", "أجل", "أجمع", "أخ", "أخذ", "أصبح", "أضحى", "أقبل", - "أقل", "أكثر", "ألا", "أم", "أما", "أمامك", "أمامكَ", "أمسى", "أمّا", "أن", "أنا", "أنت", - "أنتم", "أنتما", "أنتن", "أنتِ", "أنشأ", "أنّى", "أو", "أوشك", "أولئك", "أولئكم", "أولاء", - "أولالك", "أوّهْ", "أي", "أيا", "أين", "أينما", "أيّ", "أَنَّ", "أََيُّ", "أُفٍّ", "إذ", "إذا", "إذاً", - "إذما", "إذن", "إلى", "إليكم", "إليكما", "إليكنّ", "إليكَ", "إلَيْكَ", "إلّا", "إمّا", "إن", "إنّما", - "إي", "إياك", "إياكم", "إياكما", "إياكن", "إيانا", "إياه", "إياها", "إياهم", "إياهما", "إياهن", - "إياي", "إيهٍ", "إِنَّ", "ا", "ابتدأ", "اثر", "اجل", "احد", "اخرى", "اخلولق", "اذا", "اربعة", "ارتدّ", - "استحال", "اطار", "اعادة", "اعلنت", "اف", "اكثر", "اكد", "الألاء", "الألى", "الا", "الاخيرة", "الان", "الاول", - "الاولى", "التى", "التي", "الثاني", "الثانية", "الذاتي", "الذى", "الذي", "الذين", "السابق", "الف", "اللائي", - "اللاتي", "اللتان", "اللتيا", "اللتين", "اللذان", "اللذين", "اللواتي", "الماضي", "المقبل", "الوقت", "الى", - "اليوم", "اما", "امام", "امس", "ان", "انبرى", "انقلب", "انه", "انها", "او", "اول", "اي", "ايار", "ايام", - "ايضا", "ب", "بات", "باسم", "بان", "بخٍ", "برس", "بسبب", "بسّ", "بشكل", "بضع", "بطآن", "بعد", "بعض", "بك", - "بكم", "بكما", "بكن", "بل", "بلى", "بما", "بماذا", "بمن", "بن", "بنا", "به", "بها", "بي", "بيد", "بين", - "بَسْ", "بَلْهَ", "بِئْسَ", "تانِ", "تانِك", "تبدّل", "تجاه", "تحوّل", "تلقاء", "تلك", "تلكم", "تلكما", "تم", "تينك", - "تَيْنِ", "تِه", "تِي", "ثلاثة", "ثم", "ثمّ", "ثمّة", "ثُمَّ", "جعل", "جلل", "جميع", "جير", "حار", "حاشا", "حاليا", - "حاي", "حتى", "حرى", "حسب", "حم", "حوالى", "حول", "حيث", "حيثما", "حين", "حيَّ", "حَبَّذَا", "حَتَّى", "حَذارِ", "خلا", - "خلال", "دون", "دونك", "ذا", "ذات", "ذاك", "ذانك", "ذانِ", "ذلك", "ذلكم", "ذلكما", "ذلكن", "ذو", "ذوا", "ذواتا", "ذواتي", "ذيت", "ذينك", "ذَيْنِ", "ذِه", "ذِي", "راح", "رجع", "رويدك", "ريث", "رُبَّ", "زيارة", "سبحان", "سرعان", "سنة", "سنوات", "سوف", "سوى", "سَاءَ", "سَاءَمَا", "شبه", "شخصا", "شرع", "شَتَّانَ", "صار", "صباح", "صفر", "صهٍ", "صهْ", "ضد", "ضمن", "طاق", "طالما", "طفق", "طَق", "ظلّ", "عاد", "عام", "عاما", "عامة", "عدا", "عدة", "عدد", "عدم", "عسى", "عشر", "عشرة", "علق", "على", "عليك", "عليه", "عليها", "علًّ", "عن", "عند", "عندما", "عوض", "عين", "عَدَسْ", "عَمَّا", "غدا", "غير", "ـ", "ف", "فان", "فلان", "فو", "فى", "في", "فيم", "فيما", "فيه", "فيها", "قال", "قام", "قبل", "قد", "قطّ", "قلما", "قوة", "كأنّما", "كأين", "كأيّ", "كأيّن", "كاد", "كان", "كانت", "كذا", "كذلك", "كرب", "كل", "كلا", "كلاهما", "كلتا", "كلم", "كليكما", "كليهما", "كلّما", "كلَّا", "كم", "كما", "كي", "كيت", "كيف", "كيفما", "كَأَنَّ", "كِخ", "لئن", "لا", "لات", "لاسيما", "لدن", "لدى", "لعمر", "لقاء", "لك", "لكم", "لكما", "لكن", "لكنَّما", "لكي", "لكيلا", "للامم", "لم", "لما", "لمّا", "لن", "لنا", "له", "لها", "لو", "لوكالة", "لولا", "لوما", "لي", "لَسْتَ", "لَسْتُ", "لَسْتُم", "لَسْتُمَا", "لَسْتُنَّ", "لَسْتِ", "لَسْنَ", "لَعَلَّ", "لَكِنَّ", "لَيْتَ", "لَيْسَ", "لَيْسَا", "لَيْسَتَا", "لَيْسَتْ", "لَيْسُوا", "لَِسْنَا", "ما", "ماانفك", "مابرح", "مادام", "ماذا", "مازال", "مافتئ", "مايو", "متى", "مثل", "مذ", "مساء", "مع", "معاذ", "مقابل", "مكانكم", "مكانكما", "مكانكنّ", "مكانَك", "مليار", "مليون", "مما", "ممن", "من", "منذ", "منها", "مه", "مهما", "مَنْ", "مِن", "نحن", "نحو", "نعم", "نفس", "نفسه", "نهاية", "نَخْ", "نِعِمّا", "نِعْمَ", "ها", "هاؤم", "هاكَ", "هاهنا", "هبّ", "هذا", "هذه", "هكذا", "هل", "هلمَّ", "هلّا", "هم", "هما", "هن", "هنا", "هناك", "هنالك", "هو", "هي", "هيا", "هيت", "هيّا", "هَؤلاء", "هَاتانِ", "هَاتَيْنِ", "هَاتِه", "هَاتِي", "هَجْ", "هَذا", "هَذانِ", "هَذَيْنِ", "هَذِه", "هَذِي", "هَيْهَاتَ", "و", "و6", "وا", "واحد", "واضاف", "واضافت", "واكد", "وان", "واهاً", "واوضح", "وراءَك", "وفي", "وقال", "وقالت", "وقد", "وقف", "وكان", "وكانت", "ولا", "ولم", - "ومن", "وهو", "وهي", "ويكأنّ", "وَيْ", "وُشْكَانََ", "يكون", "يمكن", "يوم", "ّأيّان"} - - Public StopWordsDutch() As String = {"aan", "achte", "achter", "af", "al", "alle", "alleen", "alles", "als", "ander", "anders", "beetje", - "behalve", "beide", "beiden", "ben", "beneden", "bent", "bij", "bijna", "bijv", "blijkbaar", "blijken", "boven", "bv", - "daar", "daardoor", "daarin", "daarna", "daarom", "daaruit", "dan", "dat", "de", "deden", "deed", "derde", "derhalve", "dertig", - "deze", "dhr", "die", "dit", "doe", "doen", "doet", "door", "drie", "duizend", "echter", "een", "eens", "eerst", "eerste", "eigen", - "eigenlijk", "elk", "elke", "en", "enige", "er", "erg", "ergens", "etc", "etcetera", "even", "geen", "genoeg", "geweest", "haar", - "haarzelf", "had", "hadden", "heb", "hebben", "hebt", "hedden", "heeft", "heel", "hem", "hemzelf", "hen", "het", "hetzelfde", - "hier", "hierin", "hierna", "hierom", "hij", "hijzelf", "hoe", "honderd", "hun", "ieder", "iedere", "iedereen", "iemand", "iets", - "ik", "in", "inderdaad", "intussen", "is", "ja", "je", "jij", "jijzelf", "jou", "jouw", "jullie", "kan", "kon", "konden", "kun", - "kunnen", "kunt", "laatst", "later", "lijken", "lijkt", "maak", "maakt", "maakte", "maakten", "maar", "mag", "maken", "me", "meer", - "meest", "meestal", "men", "met", "mevr", "mij", "mijn", "minder", "miss", "misschien", "missen", "mits", "mocht", "mochten", - "moest", "moesten", "moet", "moeten", "mogen", "mr", "mrs", "mw", "na", "naar", "nam", "namelijk", "nee", "neem", "negen", - "nemen", "nergens", "niemand", "niet", "niets", "niks", "noch", "nochtans", "nog", "nooit", "nu", "nv", "of", "om", "omdat", - "ondanks", "onder", "ondertussen", "ons", "onze", "onzeker", "ooit", "ook", "op", "over", "overal", "overige", "paar", "per", - "recent", "redelijk", "samen", "sinds", "steeds", "te", "tegen", "tegenover", "thans", "tien", "tiende", "tijdens", "tja", "toch", - "toe", "tot", "totdat", "tussen", "twee", "tweede", "u", "uit", "uw", "vaak", "van", "vanaf", "veel", "veertig", "verder", - "verscheidene", "verschillende", "via", "vier", "vierde", "vijf", "vijfde", "vijftig", "volgend", "volgens", "voor", "voordat", - "voorts", "waar", "waarom", "waarschijnlijk", "wanneer", "waren", "was", "wat", "we", "wederom", "weer", "weinig", "wel", "welk", - "welke", "werd", "werden", "werder", "whatever", "wie", "wij", "wijzelf", "wil", "wilden", "willen", "word", "worden", "wordt", "zal", - "ze", "zei", "zeker", "zelf", "zelfde", "zes", "zeven", "zich", "zij", "zijn", "zijzelf", "zo", "zoals", "zodat", "zou", "zouden", - "zulk", "zullen"} + 'STEP 2 + ' + ' (m>0) ATIONAL -> ATE relational -> relate + ' (m>0) TIONAL -> TION conditional -> condition + ' rational -> rational + ' (m>0) ENCI -> ENCE valenci -> valence + ' (m>0) ANCI -> ANCE hesitanci -> hesitance + ' (m>0) IZER -> IZE digitizer -> digitize + 'Also, + ' (m>0) BLI -> BLE conformabli -> conformable + ' + ' (m>0) ALLI -> AL radicalli -> radical + ' (m>0) ENTLI -> ENT differentli -> different + ' (m>0) ELI -> E vileli - > vile + ' (m>0) OUSLI -> OUS analogousli -> analogous + ' (m>0) IZATION -> IZE vietnamization -> vietnamize + ' (m>0) ATION -> ATE predication -> predicate + ' (m>0) ATOR -> ATE operator -> operate + ' (m>0) ALISM -> AL feudalism -> feudal + ' (m>0) IVENESS -> IVE decisiveness -> decisive + ' (m>0) FULNESS -> FUL hopefulness -> hopeful + ' (m>0) OUSNESS -> OUS callousness -> callous + ' (m>0) ALITI -> AL formaliti -> formal + ' (m>0) IVITI -> IVE sensitiviti -> sensitive + ' (m>0) BILITI -> BLE sensibiliti -> sensible + 'Also, + ' (m>0) LOGI -> LOG apologi -> apolog + ' + 'The test for the string S1 can be made fast by doing a program switch on + 'the penultimate letter of the word being tested. This gives a fairly even + 'breakdown of the possible values of the string S1. It will be seen in fact + 'that the S1-strings in step 2 are presented here in the alphabetical order + 'of their penultimate letter. Similar techniques may be applied in the other + 'steps. - Public StopWordsENG() As String = {"a", "as", "able", "about", "above", "according", "accordingly", "across", "actually", "after", "afterwards", "again", "against", "aint", - "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "another", "any", - "anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "arent", "around", - "as", "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "b", "be", "became", "because", "become", "becomes", "becoming", - "been", "before", "beforehand", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "both", "brief", - "but", "by", "c", "cmon", "cs", "came", "can", "cant", "cannot", "cant", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com", - "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldnt", - "course", "currently", "d", "definitely", "described", "despite", "did", "didnt", "different", "do", "does", "doesnt", "doing", "dont", "done", "down", - "downwards", "during", "e", "each", "edu", "eg", "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever", - "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "f", "far", "few", "fifth", "first", "five", "followed", - "following", "follows", "for", "former", "formerly", "forth", "four", "from", "further", "furthermore", "g", "get", "gets", "getting", "given", "gives", - "go", "goes", "going", "gone", "got", "gotten", "greetings", "h", "had", "hadnt", "happens", "hardly", "has", "hasnt", "have", "havent", "having", "he", - "hes", "hello", "help", "hence", "her", "here", "heres", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself", "his", - "hither", "hopefully", "how", "howbeit", "however", "i", "id", "ill", "im", "ive", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc", "indeed", - "indicate", "indicated", "indicates", "inner", "insofar", "instead", "into", "inward", "is", "isnt", "it", "itd", "itll", "its", "its", "itself", "j", - "just", "k", "keep", "keeps", "kept", "know", "known", "knows", "l", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", - "like", "liked", "likely", "little", "look", "looking", "looks", "ltd", "m", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile", "merely", "might", - "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "n", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither", - "never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone", "nor", "normally", "not", "nothing", "novel", "now", "nowhere", "o", - "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", - "ours", "ourselves", "out", "outside", "over", "overall", "own", "p", "particular", "particularly", "per", "perhaps", "placed", "please", "plus", "possible", - "presumably", "probably", "provides", "q", "que", "quite", "qv", "r", "rather", "rd", "re", "really", "reasonably", "regarding", "regardless", "regards", - "relatively", "respectively", "right", "s", "said", "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", - "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "she", "should", "shouldnt", "since", "six", "so", - "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying", - "still", "sub", "such", "sup", "sure", "t", "ts", "take", "taken", "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "thats", "thats", "the", - "their", "theirs", "them", "themselves", "then", "thence", "there", "theres", "thereafter", "thereby", "therefore", "therein", "theres", "thereupon", - "these", "they", "theyd", "theyll", "theyre", "theyve", "think", "third", "this", "thorough", "thoroughly", "those", "though", "three", "through", - "throughout", "thru", "thus", "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "twice", "two", "u", "un", - "under", "unfortunately", "unless", "unlikely", "until", "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using", "usually", "uucp", "v", - "value", "various", "very", "via", "viz", "vs", "w", "want", "wants", "was", "wasnt", "way", "we", "wed", "well", "were", "weve", "welcome", "well", - "went", "were", "werent", "what", "whats", "whatever", "when", "whence", "whenever", "where", "wheres", "whereafter", "whereas", "whereby", "wherein", - "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whos", "whoever", "whole", "whom", "whose", "why", "will", "willing", "wish", - "with", "within", "without", "wont", "wonder", "would", "wouldnt", "x", "y", "yes", "yet", "you", "youd", "youll", "youre", "youve", "your", "yours", - "yourself", "yourselves", "youll", "z", "zero"} + 'declaring local variables + Dim step2(20, 1) As String + Dim i As Byte + Dim temp As String - Public StopWordsFrench() As String = {"a", "abord", "absolument", "afin", "ah", "ai", "aie", "ailleurs", "ainsi", "ait", "allaient", "allo", "allons", - "allô", "alors", "anterieur", "anterieure", "anterieures", "apres", "après", "as", "assez", "attendu", "au", "aucun", "aucune", - "aujourd", "aujourd'hui", "aupres", "auquel", "aura", "auraient", "aurait", "auront", "aussi", "autre", "autrefois", "autrement", - "autres", "autrui", "aux", "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avec", "avoir", "avons", "ayant", "b", - "bah", "bas", "basee", "bat", "beau", "beaucoup", "bien", "bigre", "boum", "bravo", "brrr", "c", "car", "ce", "ceci", "cela", "celle", - "celle-ci", "celle-là", "celles", "celles-ci", "celles-là", "celui", "celui-ci", "celui-là", "cent", "cependant", "certain", - "certaine", "certaines", "certains", "certes", "ces", "cet", "cette", "ceux", "ceux-ci", "ceux-là", "chacun", "chacune", "chaque", - "cher", "chers", "chez", "chiche", "chut", "chère", "chères", "ci", "cinq", "cinquantaine", "cinquante", "cinquantième", "cinquième", - "clac", "clic", "combien", "comme", "comment", "comparable", "comparables", "compris", "concernant", "contre", "couic", "crac", "d", - "da", "dans", "de", "debout", "dedans", "dehors", "deja", "delà", "depuis", "dernier", "derniere", "derriere", "derrière", "des", - "desormais", "desquelles", "desquels", "dessous", "dessus", "deux", "deuxième", "deuxièmement", "devant", "devers", "devra", - "different", "differentes", "differents", "différent", "différente", "différentes", "différents", "dire", "directe", "directement", - "dit", "dite", "dits", "divers", "diverse", "diverses", "dix", "dix-huit", "dix-neuf", "dix-sept", "dixième", "doit", "doivent", "donc", - "dont", "douze", "douzième", "dring", "du", "duquel", "durant", "dès", "désormais", "e", "effet", "egale", "egalement", "egales", "eh", - "elle", "elle-même", "elles", "elles-mêmes", "en", "encore", "enfin", "entre", "envers", "environ", "es", "est", "et", "etant", "etc", - "etre", "eu", "euh", "eux", "eux-mêmes", "exactement", "excepté", "extenso", "exterieur", "f", "fais", "faisaient", "faisant", "fait", - "façon", "feront", "fi", "flac", "floc", "font", "g", "gens", "h", "ha", "hein", "hem", "hep", "hi", "ho", "holà", "hop", "hormis", "hors", - "hou", "houp", "hue", "hui", "huit", "huitième", "hum", "hurrah", "hé", "hélas", "i", "il", "ils", "importe", "j", "je", "jusqu", "jusque", - "juste", "k", "l", "la", "laisser", "laquelle", "las", "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "longtemps", - "lors", "lorsque", "lui", "lui-meme", "lui-même", "là", "lès", "m", "ma", "maint", "maintenant", "mais", "malgre", "malgré", "maximale", - "me", "meme", "memes", "merci", "mes", "mien", "mienne", "miennes", "miens", "mille", "mince", "minimale", "moi", "moi-meme", "moi-même", - "moindres", "moins", "mon", "moyennant", "multiple", "multiples", "même", "mêmes", "n", "na", "naturel", "naturelle", "naturelles", "ne", - "neanmoins", "necessaire", "necessairement", "neuf", "neuvième", "ni", "nombreuses", "nombreux", "non", "nos", "notamment", "notre", - "nous", "nous-mêmes", "nouveau", "nul", "néanmoins", "nôtre", "nôtres", "o", "oh", "ohé", "ollé", "olé", "on", "ont", "onze", "onzième", - "ore", "ou", "ouf", "ouias", "oust", "ouste", "outre", "ouvert", "ouverte", "ouverts", "o|", "où", "p", "paf", "pan", "par", "parce", - "parfois", "parle", "parlent", "parler", "parmi", "parseme", "partant", "particulier", "particulière", "particulièrement", "pas", - "passé", "pendant", "pense", "permet", "personne", "peu", "peut", "peuvent", "peux", "pff", "pfft", "pfut", "pif", "pire", "plein", - "plouf", "plus", "plusieurs", "plutôt", "possessif", "possessifs", "possible", "possibles", "pouah", "pour", "pourquoi", "pourrais", - "pourrait", "pouvait", "prealable", "precisement", "premier", "première", "premièrement", "pres", "probable", "probante", - "procedant", "proche", "près", "psitt", "pu", "puis", "puisque", "pur", "pure", "q", "qu", "quand", "quant", "quant-à-soi", "quanta", - "quarante", "quatorze", "quatre", "quatre-vingt", "quatrième", "quatrièmement", "que", "quel", "quelconque", "quelle", "quelles", - "quelqu'un", "quelque", "quelques", "quels", "qui", "quiconque", "quinze", "quoi", "quoique", "r", "rare", "rarement", "rares", - "relative", "relativement", "remarquable", "rend", "rendre", "restant", "reste", "restent", "restrictif", "retour", "revoici", - "revoilà", "rien", "s", "sa", "sacrebleu", "sait", "sans", "sapristi", "sauf", "se", "sein", "seize", "selon", "semblable", "semblaient", - "semble", "semblent", "sent", "sept", "septième", "sera", "seraient", "serait", "seront", "ses", "seul", "seule", "seulement", "si", - "sien", "sienne", "siennes", "siens", "sinon", "six", "sixième", "soi", "soi-même", "soit", "soixante", "son", "sont", "sous", "souvent", - "specifique", "specifiques", "speculatif", "stop", "strictement", "subtiles", "suffisant", "suffisante", "suffit", "suis", "suit", - "suivant", "suivante", "suivantes", "suivants", "suivre", "superpose", "sur", "surtout", "t", "ta", "tac", "tant", "tardive", "te", - "tel", "telle", "tellement", "telles", "tels", "tenant", "tend", "tenir", "tente", "tes", "tic", "tien", "tienne", "tiennes", "tiens", - "toc", "toi", "toi-même", "ton", "touchant", "toujours", "tous", "tout", "toute", "toutefois", "toutes", "treize", "trente", "tres", - "trois", "troisième", "troisièmement", "trop", "très", "tsoin", "tsouin", "tu", "té", "u", "un", "une", "unes", "uniformement", "unique", - "uniques", "uns", "v", "va", "vais", "vas", "vers", "via", "vif", "vifs", "vingt", "vivat", "vive", "vives", "vlan", "voici", "voilà", - "vont", "vos", "votre", "vous", "vous-mêmes", "vu", "vé", "vôtre", "vôtres", "w", "x", "y", "z", "zut", "à", "â", "ça", "ès", "étaient", - "étais", "était", "étant", "été", "être", "ô"} + 'initializing contents of 2D array + step2(0, 0) = "ational" + step2(0, 1) = "ate" + step2(1, 0) = "tional" + step2(1, 1) = "tion" + step2(2, 0) = "enci" + step2(2, 1) = "ence" + step2(3, 0) = "anci" + step2(3, 1) = "ance" + step2(4, 0) = "izer" + step2(4, 1) = "ize" + step2(5, 0) = "bli" + step2(5, 1) = "ble" + step2(6, 0) = "alli" + step2(6, 1) = "al" + step2(7, 0) = "entli" + step2(7, 1) = "ent" + step2(8, 0) = "eli" + step2(8, 1) = "e" + step2(9, 0) = "ousli" + step2(9, 1) = "ous" + step2(10, 0) = "ization" + step2(10, 1) = "ize" + step2(11, 0) = "ation" + step2(11, 1) = "ate" + step2(12, 0) = "ator" + step2(12, 1) = "ate" + step2(13, 0) = "alism" + step2(13, 1) = "al" + step2(14, 0) = "iveness" + step2(14, 1) = "ive" + step2(15, 0) = "fulness" + step2(15, 1) = "ful" + step2(16, 0) = "ousness" + step2(16, 1) = "ous" + step2(17, 0) = "aliti" + step2(17, 1) = "al" + step2(18, 0) = "iviti" + step2(18, 1) = "ive" + step2(19, 0) = "biliti" + step2(19, 1) = "ble" + step2(20, 0) = "logi" + step2(20, 1) = "log" - Public StopWordsItalian() As String = {"IE", "a", "abbastanza", "abbia", "abbiamo", "abbiano", "abbiate", "accidenti", "ad", "adesso", "affinche", "agl", "agli", - "ahime", "ahimè", "ai", "al", "alcuna", "alcuni", "alcuno", "all", "alla", "alle", "allo", "allora", "altri", "altrimenti", "altro", - "altrove", "altrui", "anche", "ancora", "anni", "anno", "ansa", "anticipo", "assai", "attesa", "attraverso", "avanti", "avemmo", - "avendo", "avente", "aver", "avere", "averlo", "avesse", "avessero", "avessi", "avessimo", "aveste", "avesti", "avete", "aveva", - "avevamo", "avevano", "avevate", "avevi", "avevo", "avrai", "avranno", "avrebbe", "avrebbero", "avrei", "avremmo", "avremo", - "avreste", "avresti", "avrete", "avrà", "avrò", "avuta", "avute", "avuti", "avuto", "basta", "bene", "benissimo", "berlusconi", - "brava", "bravo", "c", "casa", "caso", "cento", "certa", "certe", "certi", "certo", "che", "chi", "chicchessia", "chiunque", "ci", - "ciascuna", "ciascuno", "cima", "cio", "cioe", "cioè", "circa", "citta", "città", "ciò", "co", "codesta", "codesti", "codesto", - "cogli", "coi", "col", "colei", "coll", "coloro", "colui", "come", "cominci", "comunque", "con", "concernente", "conciliarsi", - "conclusione", "consiglio", "contro", "cortesia", "cos", "cosa", "cosi", "così", "cui", "d", "da", "dagl", "dagli", "dai", "dal", - "dall", "dalla", "dalle", "dallo", "dappertutto", "davanti", "degl", "degli", "dei", "del", "dell", "della", "delle", "dello", - "dentro", "detto", "deve", "di", "dice", "dietro", "dire", "dirimpetto", "diventa", "diventare", "diventato", "dopo", "dov", "dove", - "dovra", "dovrà", "dovunque", "due", "dunque", "durante", "e", "ebbe", "ebbero", "ebbi", "ecc", "ecco", "ed", "effettivamente", "egli", - "ella", "entrambi", "eppure", "era", "erano", "eravamo", "eravate", "eri", "ero", "esempio", "esse", "essendo", "esser", "essere", - "essi", "ex", "fa", "faccia", "facciamo", "facciano", "facciate", "faccio", "facemmo", "facendo", "facesse", "facessero", "facessi", - "facessimo", "faceste", "facesti", "faceva", "facevamo", "facevano", "facevate", "facevi", "facevo", "fai", "fanno", "farai", - "faranno", "fare", "farebbe", "farebbero", "farei", "faremmo", "faremo", "fareste", "faresti", "farete", "farà", "farò", "fatto", - "favore", "fece", "fecero", "feci", "fin", "finalmente", "finche", "fine", "fino", "forse", "forza", "fosse", "fossero", "fossi", - "fossimo", "foste", "fosti", "fra", "frattempo", "fu", "fui", "fummo", "fuori", "furono", "futuro", "generale", "gia", "giacche", - "giorni", "giorno", "già", "gli", "gliela", "gliele", "glieli", "glielo", "gliene", "governo", "grande", "grazie", "gruppo", "ha", - "haha", "hai", "hanno", "ho", "i", "ieri", "il", "improvviso", "in", "inc", "infatti", "inoltre", "insieme", "intanto", "intorno", - "invece", "io", "l", "la", "lasciato", "lato", "lavoro", "le", "lei", "li", "lo", "lontano", "loro", "lui", "lungo", "luogo", "là", - "ma", "macche", "magari", "maggior", "mai", "male", "malgrado", "malissimo", "mancanza", "marche", "me", "medesimo", "mediante", - "meglio", "meno", "mentre", "mesi", "mezzo", "mi", "mia", "mie", "miei", "mila", "miliardi", "milioni", "minimi", "ministro", - "mio", "modo", "molti", "moltissimo", "molto", "momento", "mondo", "mosto", "nazionale", "ne", "negl", "negli", "nei", "nel", - "nell", "nella", "nelle", "nello", "nemmeno", "neppure", "nessun", "nessuna", "nessuno", "niente", "no", "noi", "non", "nondimeno", - "nonostante", "nonsia", "nostra", "nostre", "nostri", "nostro", "novanta", "nove", "nulla", "nuovo", "o", "od", "oggi", "ogni", - "ognuna", "ognuno", "oltre", "oppure", "ora", "ore", "osi", "ossia", "ottanta", "otto", "paese", "parecchi", "parecchie", - "parecchio", "parte", "partendo", "peccato", "peggio", "per", "perche", "perchè", "perché", "percio", "perciò", "perfino", "pero", - "persino", "persone", "però", "piedi", "pieno", "piglia", "piu", "piuttosto", "più", "po", "pochissimo", "poco", "poi", "poiche", - "possa", "possedere", "posteriore", "posto", "potrebbe", "preferibilmente", "presa", "press", "prima", "primo", "principalmente", - "probabilmente", "proprio", "puo", "pure", "purtroppo", "può", "qualche", "qualcosa", "qualcuna", "qualcuno", "quale", "quali", - "qualunque", "quando", "quanta", "quante", "quanti", "quanto", "quantunque", "quasi", "quattro", "quel", "quella", "quelle", - "quelli", "quello", "quest", "questa", "queste", "questi", "questo", "qui", "quindi", "realmente", "recente", "recentemente", - "registrazione", "relativo", "riecco", "salvo", "sara", "sarai", "saranno", "sarebbe", "sarebbero", "sarei", "saremmo", "saremo", - "sareste", "saresti", "sarete", "sarà", "sarò", "scola", "scopo", "scorso", "se", "secondo", "seguente", "seguito", "sei", "sembra", - "sembrare", "sembrato", "sembri", "sempre", "senza", "sette", "si", "sia", "siamo", "siano", "siate", "siete", "sig", "solito", - "solo", "soltanto", "sono", "sopra", "sotto", "spesso", "srl", "sta", "stai", "stando", "stanno", "starai", "staranno", "starebbe", - "starebbero", "starei", "staremmo", "staremo", "stareste", "staresti", "starete", "starà", "starò", "stata", "state", "stati", - "stato", "stava", "stavamo", "stavano", "stavate", "stavi", "stavo", "stemmo", "stessa", "stesse", "stessero", "stessi", "stessimo", - "stesso", "steste", "stesti", "stette", "stettero", "stetti", "stia", "stiamo", "stiano", "stiate", "sto", "su", "sua", "subito", - "successivamente", "successivo", "sue", "sugl", "sugli", "sui", "sul", "sull", "sulla", "sulle", "sullo", "suo", "suoi", "tale", - "tali", "talvolta", "tanto", "te", "tempo", "ti", "titolo", "torino", "tra", "tranne", "tre", "trenta", "troppo", "trovato", "tu", - "tua", "tue", "tuo", "tuoi", "tutta", "tuttavia", "tutte", "tutti", "tutto", "uguali", "ulteriore", "ultimo", "un", "una", "uno", - "uomo", "va", "vale", "vari", "varia", "varie", "vario", "verso", "vi", "via", "vicino", "visto", "vita", "voi", "volta", "volte", - "vostra", "vostre", "vostri", "vostro", "è"} + 'checking word + For i = 0 To 20 Step 1 + If porterEndsWith(str, step2(i, 0)) Then + temp = porterTrimEnd(str, Len(step2(i, 0))) + If porterCountm(temp) > 0 Then + str = porterTrimEnd(str, Len(step2(i, 0))) + str = porterAppendEnd(str, step2(i, 1)) + End If + Exit For + End If + Next i - Public StopWordsSpanish() As String = {"a", "actualmente", "acuerdo", "adelante", "ademas", "además", "adrede", "afirmó", "agregó", "ahi", "ahora", - "ahí", "al", "algo", "alguna", "algunas", "alguno", "algunos", "algún", "alli", "allí", "alrededor", "ambos", "ampleamos", - "antano", "antaño", "ante", "anterior", "antes", "apenas", "aproximadamente", "aquel", "aquella", "aquellas", "aquello", - "aquellos", "aqui", "aquél", "aquélla", "aquéllas", "aquéllos", "aquí", "arriba", "arribaabajo", "aseguró", "asi", "así", - "atras", "aun", "aunque", "ayer", "añadió", "aún", "b", "bajo", "bastante", "bien", "breve", "buen", "buena", "buenas", "bueno", - "buenos", "c", "cada", "casi", "cerca", "cierta", "ciertas", "cierto", "ciertos", "cinco", "claro", "comentó", "como", "con", - "conmigo", "conocer", "conseguimos", "conseguir", "considera", "consideró", "consigo", "consigue", "consiguen", "consigues", - "contigo", "contra", "cosas", "creo", "cual", "cuales", "cualquier", "cuando", "cuanta", "cuantas", "cuanto", "cuantos", "cuatro", - "cuenta", "cuál", "cuáles", "cuándo", "cuánta", "cuántas", "cuánto", "cuántos", "cómo", "d", "da", "dado", "dan", "dar", "de", - "debajo", "debe", "deben", "debido", "decir", "dejó", "del", "delante", "demasiado", "demás", "dentro", "deprisa", "desde", - "despacio", "despues", "después", "detras", "detrás", "dia", "dias", "dice", "dicen", "dicho", "dieron", "diferente", "diferentes", - "dijeron", "dijo", "dio", "donde", "dos", "durante", "día", "días", "dónde", "e", "ejemplo", "el", "ella", "ellas", "ello", "ellos", - "embargo", "empleais", "emplean", "emplear", "empleas", "empleo", "en", "encima", "encuentra", "enfrente", "enseguida", "entonces", - "entre", "era", "eramos", "eran", "eras", "eres", "es", "esa", "esas", "ese", "eso", "esos", "esta", "estaba", "estaban", "estado", - "estados", "estais", "estamos", "estan", "estar", "estará", "estas", "este", "esto", "estos", "estoy", "estuvo", "está", "están", "ex", - "excepto", "existe", "existen", "explicó", "expresó", "f", "fin", "final", "fue", "fuera", "fueron", "fui", "fuimos", "g", "general", - "gran", "grandes", "gueno", "h", "ha", "haber", "habia", "habla", "hablan", "habrá", "había", "habían", "hace", "haceis", "hacemos", - "hacen", "hacer", "hacerlo", "haces", "hacia", "haciendo", "hago", "han", "hasta", "hay", "haya", "he", "hecho", "hemos", "hicieron", - "hizo", "horas", "hoy", "hubo", "i", "igual", "incluso", "indicó", "informo", "informó", "intenta", "intentais", "intentamos", "intentan", - "intentar", "intentas", "intento", "ir", "j", "junto", "k", "l", "la", "lado", "largo", "las", "le", "lejos", "les", "llegó", "lleva", - "llevar", "lo", "los", "luego", "lugar", "m", "mal", "manera", "manifestó", "mas", "mayor", "me", "mediante", "medio", "mejor", "mencionó", - "menos", "menudo", "mi", "mia", "mias", "mientras", "mio", "mios", "mis", "misma", "mismas", "mismo", "mismos", "modo", "momento", "mucha", - "muchas", "mucho", "muchos", "muy", "más", "mí", "mía", "mías", "mío", "míos", "n", "nada", "nadie", "ni", "ninguna", "ningunas", "ninguno", - "ningunos", "ningún", "no", "nos", "nosotras", "nosotros", "nuestra", "nuestras", "nuestro", "nuestros", "nueva", "nuevas", "nuevo", - "nuevos", "nunca", "o", "ocho", "os", "otra", "otras", "otro", "otros", "p", "pais", "para", "parece", "parte", "partir", "pasada", - "pasado", "paìs", "peor", "pero", "pesar", "poca", "pocas", "poco", "pocos", "podeis", "podemos", "poder", "podria", "podriais", - "podriamos", "podrian", "podrias", "podrá", "podrán", "podría", "podrían", "poner", "por", "porque", "posible", "primer", "primera", - "primero", "primeros", "principalmente", "pronto", "propia", "propias", "propio", "propios", "proximo", "próximo", "próximos", "pudo", - "pueda", "puede", "pueden", "puedo", "pues", "q", "qeu", "que", "quedó", "queremos", "quien", "quienes", "quiere", "quiza", "quizas", - "quizá", "quizás", "quién", "quiénes", "qué", "r", "raras", "realizado", "realizar", "realizó", "repente", "respecto", "s", "sabe", - "sabeis", "sabemos", "saben", "saber", "sabes", "salvo", "se", "sea", "sean", "segun", "segunda", "segundo", "según", "seis", "ser", - "sera", "será", "serán", "sería", "señaló", "si", "sido", "siempre", "siendo", "siete", "sigue", "siguiente", "sin", "sino", "sobre", - "sois", "sola", "solamente", "solas", "solo", "solos", "somos", "son", "soy", "soyos", "su", "supuesto", "sus", "suya", "suyas", "suyo", - "sé", "sí", "sólo", "t", "tal", "tambien", "también", "tampoco", "tan", "tanto", "tarde", "te", "temprano", "tendrá", "tendrán", "teneis", - "tenemos", "tener", "tenga", "tengo", "tenido", "tenía", "tercera", "ti", "tiempo", "tiene", "tienen", "toda", "todas", "todavia", - "todavía", "todo", "todos", "total", "trabaja", "trabajais", "trabajamos", "trabajan", "trabajar", "trabajas", "trabajo", "tras", - "trata", "través", "tres", "tu", "tus", "tuvo", "tuya", "tuyas", "tuyo", "tuyos", "tú", "u", "ultimo", "un", "una", "unas", "uno", "unos", - "usa", "usais", "usamos", "usan", "usar", "usas", "uso", "usted", "ustedes", "v", "va", "vais", "valor", "vamos", "van", "varias", "varios", - "vaya", "veces", "ver", "verdad", "verdadera", "verdadero", "vez", "vosotras", "vosotros", "voy", "vuestra", "vuestras", "vuestro", - "vuestros", "w", "x", "y", "ya", "yo", "z", "él", "ésa", "ésas", "ése", "ésos", "ésta", "éstas", "éste", "éstos", "última", "últimas", - "último", "últimos"} + 'retuning the word + porterAlgorithmStep2 = str - ''' - ''' Removes StopWords from sentence - ''' ARAB/ENG/DUTCH/FRENCH/SPANISH/ITALIAN - ''' Hopefully leaving just relevant words in the user sentence - ''' Currently Under Revision (takes too many words) - ''' - ''' - ''' - Public Function RemoveStopWords(ByRef Userinput As String) As String - ' Userinput = LCase(Userinput).Replace("the", "r") - For Each item In StopWordsENG - Userinput = LCase(Userinput).Replace(item, "") - Next - For Each item In StopWordsArab - Userinput = Userinput.Replace(item, "") - Next - For Each item In StopWordsDutch - Userinput = Userinput.Replace(item, "") - Next - For Each item In StopWordsFrench - Userinput = Userinput.Replace(item, "") - Next - For Each item In StopWordsItalian - Userinput = Userinput.Replace(item, "") - Next - For Each item In StopWordsSpanish - Userinput = Userinput.Replace(item, "") - Next - Return Userinput End Function - ''' - ''' Removes Stop words given a list of stop words - ''' - ''' user input - ''' stop word list - ''' - Public Function RemoveStopWords(ByRef Userinput As String, ByRef Lst As List(Of String)) As String - For Each item In Lst - Userinput = LCase(Userinput).Replace(item, "") - Next + Private Shared Function porterAlgorithmStep3(str As String) As String + + On Error Resume Next + + 'STEP 3 + ' + ' (m>0) ICATE -> IC triplicate -> triplic + ' (m>0) ATIVE -> formative -> form + ' (m>0) ALIZE -> AL formalize -> formal + ' (m>0) ICITI -> IC electriciti -> electric + ' (m>0) ICAL -> IC electrical -> electric + ' (m>0) FUL -> hopeful -> hope + ' (m>0) NESS -> goodness -> good + + 'declaring local variables + Dim i As Byte + Dim temp As String + Dim step3(6, 1) As String + + 'initializing contents of 2D array + step3(0, 0) = "icate" + step3(0, 1) = "ic" + step3(1, 0) = "ative" + step3(1, 1) = "" + step3(2, 0) = "alize" + step3(2, 1) = "al" + step3(3, 0) = "iciti" + step3(3, 1) = "ic" + step3(4, 0) = "ical" + step3(4, 1) = "ic" + step3(5, 0) = "ful" + step3(5, 1) = "" + step3(6, 0) = "ness" + step3(6, 1) = "" + + 'checking word + For i = 0 To 6 Step 1 + If porterEndsWith(str, step3(i, 0)) Then + temp = porterTrimEnd(str, Len(step3(i, 0))) + If porterCountm(temp) > 0 Then + str = porterTrimEnd(str, Len(step3(i, 0))) + str = porterAppendEnd(str, step3(i, 1)) + End If + Exit For + End If + Next i + + 'retuning the word + porterAlgorithmStep3 = str + End Function - End Class + Private Shared Function porterAlgorithmStep4(str As String) As String - End Namespace - Public Module TextExtensions + On Error Resume Next - ''' - ''' Add full stop to end of String - ''' - ''' - ''' - - Public Function AddFullStop(ByRef MESSAGE As String) As String - AddFullStop = MESSAGE - If MESSAGE = "" Then Exit Function - MESSAGE = Trim(MESSAGE) - If MESSAGE Like "*." Then Exit Function - AddFullStop = MESSAGE + "." - End Function + 'STEP 4 + ' + ' (m>1) AL -> revival -> reviv + ' (m>1) ANCE -> allowance -> allow + ' (m>1) ENCE -> inference -> infer + ' (m>1) ER -> airliner -> airlin + ' (m>1) IC -> gyroscopic -> gyroscop + ' (m>1) ABLE -> adjustable -> adjust + ' (m>1) IBLE -> defensible -> defens + ' (m>1) ANT -> irritant -> irrit + ' (m>1) EMENT -> replacement -> replac + ' (m>1) MENT -> adjustment -> adjust + ' (m>1) ENT -> dependent -> depend + ' (m>1 and (*S or *T)) ION -> adoption -> adopt + ' (m>1) OU -> homologou -> homolog + ' (m>1) ISM -> communism -> commun + ' (m>1) ATE -> activate -> activ + ' (m>1) ITI -> angulariti -> angular + ' (m>1) OUS -> homologous -> homolog + ' (m>1) IVE -> effective -> effect + ' (m>1) IZE -> bowdlerize -> bowdler + ' + 'The suffixes are now removed. All that remains is a little tidying up. - ''' - ''' Adds string to end of string (no spaces) - ''' - ''' base string - ''' Add before (no spaces) - ''' - - Public Function AddPrefix(ByRef Str As String, ByVal Prefix As String) As String - Return Prefix & Str - End Function + 'declaring local variables + Dim i As Byte + Dim temp As String + Dim step4(18) As String - ''' - ''' Adds Suffix to String (No Spaces) - ''' - ''' Base string - ''' To be added After - ''' - - Public Function AddSuffix(ByRef Str As String, ByVal Suffix As String) As String - Return Str & Suffix - End Function + 'initializing contents of 2D array + step4(0) = "al" + step4(1) = "ance" + step4(2) = "ence" + step4(3) = "er" + step4(4) = "ic" + step4(5) = "able" + step4(6) = "ible" + step4(7) = "ant" + step4(8) = "ement" + step4(9) = "ment" + step4(10) = "ent" + step4(11) = "ion" + step4(12) = "ou" + step4(13) = "ism" + step4(14) = "ate" + step4(15) = "iti" + step4(16) = "ous" + step4(17) = "ive" + step4(18) = "ize" - ''' - ''' GO THROUGH EACH CHARACTER AND ' IF PUNCTUATION IE .!?,:'"; REPLACE WITH A SPACE ' IF , - ''' OR . THEN CHECK IF BETWEEN TWO NUMBERS, IF IT IS ' THEN LEAVE IT, ELSE REPLACE IT WITH A - ''' SPACE ' - ''' - ''' String to be formatted - ''' - ''' - - Public Function AlphaNumericalOnly(ByRef STRINPUT As String) As String + 'checking word + For i = 0 To 18 Step 1 - Dim A As Short - For A = 1 To Len(STRINPUT) - If Mid(STRINPUT, A, 1) = "." Or - Mid(STRINPUT, A, 1) = "!" Or - Mid(STRINPUT, A, 1) = "?" Or - Mid(STRINPUT, A, 1) = "," Or - Mid(STRINPUT, A, 1) = ":" Or - Mid(STRINPUT, A, 1) = "'" Or - Mid(STRINPUT, A, 1) = "[" Or - Mid(STRINPUT, A, 1) = """" Or - Mid(STRINPUT, A, 1) = ";" Then + If porterEndsWith(str, step4(i)) Then - ' BEGIN CHECKING PERIODS AND COMMAS THAT ARE IN BETWEEN NUMBERS ' - If Mid(STRINPUT, A, 1) = "." Or Mid(STRINPUT, A, 1) = "," Then - If Not (A - 1 = 0 Or A = Len(STRINPUT)) Then - If Not (IsNumeric(Mid(STRINPUT, A - 1, 1)) Or IsNumeric(Mid(STRINPUT, A + 1, 1))) Then - STRINPUT = Mid(STRINPUT, 1, A - 1) & " " & Mid(STRINPUT, A + 1, Len(STRINPUT) - A) - End If - Else - STRINPUT = Mid(STRINPUT, 1, A - 1) & " " & Mid(STRINPUT, A + 1, Len(STRINPUT) - A) - End If - Else - STRINPUT = Mid(STRINPUT, 1, A - 1) & " " & Mid(STRINPUT, A + 1, Len(STRINPUT) - A) - End If + temp = porterTrimEnd(str, Len(step4(i))) - ' END CHECKING PERIODS AND COMMAS IN BETWEEN NUMBERS ' - End If - Next A - ' RETURN PUNCTUATION STRIPPED STRING ' - AlphaNumericalOnly = STRINPUT.Replace(" ", " ") - End Function + If porterCountm(temp) > 1 Then - - Public Function AlphaNumericOnly(ByRef txt As String) As String - Dim NewText As String = "" - Dim IsLetter As Boolean = False - Dim IsNumerical As Boolean = False - For Each chr As Char In txt - IsNumerical = False - IsLetter = False - For Each item In AlphaBet - If IsLetter = False Then - If chr.ToString = item Then - IsLetter = True - Else - End If - End If - Next - 'Check Numerical - If IsLetter = False Then - For Each item In Numerical - If IsNumerical = False Then - If chr.ToString = item Then - IsNumerical = True + If porterEndsWith(str, "ion") Then + If porterEndsWith(temp, "s") Or porterEndsWith(temp, "t") Then + str = porterTrimEnd(str, Len(step4(i))) + str = porterAppendEnd(str, "") + End If Else + str = porterTrimEnd(str, Len(step4(i))) + str = porterAppendEnd(str, "") End If - End If - Next - Else - End If - If IsLetter = True Or IsNumerical = True Then - NewText &= chr.ToString - Else - NewText &= " " - End If - Next - NewText = NewText.Replace(" ", " ") - Return NewText - End Function - 'Text - - Public Function Capitalise(ByRef MESSAGE As String) As String - Dim FirstLetter As String - Capitalise = "" - If MESSAGE = "" Then Exit Function - FirstLetter = Left(MESSAGE, 1) - FirstLetter = UCase(FirstLetter) - MESSAGE = Right(MESSAGE, Len(MESSAGE) - 1) - Capitalise = (FirstLetter + MESSAGE) - End Function + End If - ''' - ''' Capitalizes the text - ''' - ''' - ''' - - Public Function CapitaliseTEXT(ByVal MESSAGE As String) As String - Dim FirstLetter As String = "" - CapitaliseTEXT = "" - If MESSAGE = "" Then Exit Function - FirstLetter = Left(MESSAGE, 1) - FirstLetter = UCase(FirstLetter) - MESSAGE = Right(MESSAGE, Len(MESSAGE) - 1) - CapitaliseTEXT = (FirstLetter + MESSAGE) - End Function + Exit For - ''' - ''' Capitalise the first letter of each word / Tilte Case - ''' - ''' A string - paragraph or sentence - ''' String - - Public Function CapitalizeWords(ByVal words As String) - Dim output As System.Text.StringBuilder = New System.Text.StringBuilder() - Dim exploded = words.Split(" ") - If (exploded IsNot Nothing) Then - For Each word As String In exploded - If word IsNot Nothing Then - output.Append(word.Substring(0, 1).ToUpper).Append(word.Substring(1, word.Length - 1)).Append(" ") End If - Next - End If + Next i - Return output.ToString() + 'retuning the word + porterAlgorithmStep4 = str - End Function + End Function - ''' - ''' A string extension method that query if this object contains the given value. - ''' - ''' The @this to act on. - ''' The value. - ''' true if the value is in the string, false if not. - - Public Function Contains(this As String, value As String) As Boolean - Return this.IndexOf(value) <> -1 - End Function + Private Shared Function porterAlgorithmStep5(str As String) As String - ''' - ''' A string extension method that query if this object contains the given value. - ''' - ''' The @this to act on. - ''' The value. - ''' Type of the comparison. - ''' true if the value is in the string, false if not. - - Public Function Contains(this As String, value As String, comparisonType As StringComparison) As Boolean - Return this.IndexOf(value, comparisonType) <> -1 - End Function + On Error Resume Next - ''' - ''' Checks if String Contains Letters - ''' - ''' - ''' - - Public Function ContainsLetters(ByVal str As String) As Boolean + 'STEP 5a + ' + ' (m>1) E -> probate -> probat + ' rate -> rate + ' (m=1 and not *o) E -> cease -> ceas + ' + 'STEP 5b + ' + ' (m>1 and *d and *L) -> single letter + ' controll -> control + ' roll -> roll - For i = 0 To str.Length - 1 - If Char.IsLetter(str.Chars(i)) Then - Return True + 'declaring local variables + Dim i As Byte + Dim temp As String + + 'Step5a + If porterEndsWith(str, "e") Then 'word ends with e + temp = porterTrimEnd(str, 1) + If porterCountm(temp) > 1 Then 'm>1 + str = porterTrimEnd(str, 1) + ElseIf porterCountm(temp) = 1 Then 'm=1 + If Not porterEndsCVC(temp) Then 'not *o + str = porterTrimEnd(str, 1) + End If + End If End If - Next - Return False + '-------------------------------------------------------------------------------------------------------- + ' + 'Step5b + If porterCountm(str) > 1 Then + If porterEndsDoubleConsonent(str) And porterEndsWith(str, "l") Then + str = porterTrimEnd(str, 1) + End If + End If - End Function + 'retuning the word + porterAlgorithmStep5 = str - ''' - ''' Counts the number of elements in the text, useful for declaring arrays when the element - ''' length is unknown could be used to split sentence on full stop Find Sentences then again - ''' on comma(conjunctions) "Find Clauses" NumberOfElements = CountElements(Userinput, delimiter) - ''' - ''' - ''' - ''' Integer : number of elements found - ''' - - Public Function CountElements(ByVal PHRASE As String, ByVal Delimiter As String) As Integer - Dim elementcounter As Integer = 0 - Dim PhraseArray As String() - PhraseArray = PHRASE.Split(Delimiter) - elementcounter = UBound(PhraseArray) - Return elementcounter - End Function + End Function - ''' - ''' counts occurrences of a specific phoneme - ''' - ''' - ''' - ''' - ''' - - Public Function CountOccurrences(ByRef strIn As String, ByRef strFind As String) As Integer - '** - ' Returns: the number of times a string appears in a string - ' - '@rem Example code for CountOccurrences() - ' - ' ' Counts the occurrences of "ow" in the supplied string. - ' - ' strTmp = "How now, brown cow" - ' Returns a value of 4 - ' - ' - 'Debug.Print "CountOccurrences(): there are " & CountOccurrences(strTmp, "ow") & - '" occurrences of 'ow'" & " in the string '" & strTmp & "'" - ' - '@param strIn Required. String. - '@param strFind Required. String. - '@return Long. - - Dim lngPos As Integer - Dim lngWordCount As Integer - - On Error GoTo PROC_ERR - - lngWordCount = 1 - - ' Find the first occurrence - lngPos = InStr(strIn, strFind) - - Do While lngPos > 0 - ' Find remaining occurrences - lngPos = InStr(lngPos + 1, strIn, strFind) - If lngPos > 0 Then - ' Increment the hit counter - lngWordCount = lngWordCount + 1 - End If - Loop + Private Shared Function porterAppendEnd(str As String, ends As String) As String - ' Return the value - CountOccurrences = lngWordCount + On Error Resume Next -PROC_EXIT: - Exit Function + 'returning the appended string + porterAppendEnd = str + ends -PROC_ERR: - MsgBox("Error: " & Err.Number & ". " & Err.Description, , NameOf(CountOccurrences)) - Resume PROC_EXIT + End Function - End Function + Private Shared Function porterContains(str As String, present As String) As Boolean - - Public Function CountVowels(ByVal InputString As String) As Integer - Dim v(9) As String 'Declare an array of 10 elements 0 to 9 - Dim vcount As Short 'This variable will contain number of vowels - Dim flag As Integer - Dim strLen As Integer - Dim i As Integer - v(0) = "a" 'First element of array is assigned small a - v(1) = "i" - v(2) = "o" - v(3) = "u" - v(4) = "e" - v(5) = "A" 'Sixth element is assigned Capital A - v(6) = "I" - v(7) = "O" - v(8) = "U" - v(9) = "E" - strLen = Len(InputString) - - For flag = 1 To strLen 'It will get every letter of entered string and loop - 'will terminate when all letters have been examined - - For i = 0 To 9 'Takes every element of v(9) one by one - 'Check if current letter is a vowel - If Mid(InputString, flag, 1) = v(i) Then - vcount = vcount + 1 ' If letter is equal to vowel - 'then increment vcount by 1 - End If - Next i 'Consider next value of v(i) - Next flag 'Consider next letter of the entered string + On Error Resume Next - CountVowels = vcount + 'checking whether strr contains present + porterContains = If(InStr(str, present) = 0, False, True) - End Function + End Function - ''' - ''' Counts tokens in string - ''' - ''' string to be searched - ''' delimiter such as space comma etc - ''' - - Public Function CountTokensInString(ByRef Str As String, ByRef Delimiter As String) As Integer - Dim Words() As String = Split(Str, Delimiter) - Return Words.Count - End Function + Private Shared Function porterContainsVowel(str As String) As Boolean - ''' - ''' Counts the words in a given text - ''' - ''' - ''' integer: number of words - ''' - - Public Function CountWords(NewText As String) As Integer - Dim TempArray() As String = NewText.Split(" ") - CountWords = UBound(TempArray) - Return CountWords - End Function + 'checking word to see if vowels are present - ''' - ''' checks Str contains keyword regardless of case - ''' - ''' - ''' - ''' - - Public Function DetectKeyWord(ByRef Userinput As String, ByRef Keyword As String) As Boolean - Dim mfound As Boolean = False - If UCase(Userinput).Contains(UCase(Keyword)) = True Or - InStr(Userinput, Keyword) > 1 Then - mfound = True - End If + Dim pattern As String - Return mfound - End Function + If Len(str) >= 0 Then - ''' - ''' DETECT IF STATMENT IS AN IF/THEN DETECT IF STATMENT IS AN IF/THEN -- -RETURNS PARTS DETIFTHEN - ''' = DETECTLOGIC(USERINPUT, "IF", "THEN", IFPART, THENPART) - ''' - ''' - ''' "IF", can also be replace by "IT CAN BE SAID THAT - ''' "THEN" can also be replaced by "it must follow that" - ''' supply empty string to be used to hold part - ''' supply empty string to be used to hold part - ''' true/false - ''' - - Public Function DetectLOGIC(ByRef userinput As String, ByRef LOGICA As String, ByRef LOGICB As String, ByRef IFPART As String, ByRef THENPART As String) As Boolean - If InStr(1, userinput, LOGICA, 1) > 0 And InStr(1, userinput, " " & LOGICB & " ", 1) > 0 Then - 'SPLIT USER INPUT - Call SplitPhrase(userinput, " " & LOGICB & " ", IFPART, THENPART) - - IFPART = Replace(IFPART, LOGICA, "", 1, -1, CompareMethod.Text) - THENPART = Replace(THENPART, " " & LOGICB & " ", "", 1, -1, CompareMethod.Text) - DetectLOGIC = True - Else - DetectLOGIC = False - End If - End Function + 'find out the CVC pattern + pattern = returnCVCpattern(str) - ''' - ''' Expand a string such as a field name by inserting a space ahead of each capitalized - ''' letter (where none exists). - ''' - ''' - ''' Expanded string - ''' - - Public Function ExpandToWords(ByVal inputString As String) As String - If inputString Is Nothing Then Return Nothing - Dim charArray = inputString.ToCharArray - Dim outStringBuilder As New System.Text.StringBuilder(inputString.Length + 10) - For index = 0 To charArray.GetUpperBound(0) - If Char.IsUpper(charArray(index)) Then - 'If previous character is also uppercase, don't expand as this may be an acronym. - If (index > 0) AndAlso Char.IsUpper(charArray(index - 1)) Then - outStringBuilder.Append(charArray(index)) - Else - outStringBuilder.Append(String.Concat(" ", charArray(index))) - End If + 'check to see if the return pattern contains a vowel + porterContainsVowel = If(InStr(pattern, "v") = 0, False, True) Else - outStringBuilder.Append(charArray(index)) + porterContainsVowel = False End If - Next - Return outStringBuilder.ToString.Replace("_", " ").Trim + End Function - End Function + Private Shared Function porterCountm(str As String) As Byte - ''' - ''' A string extension method that extracts this object. - ''' - ''' The @this to act on. - ''' The predicate. - ''' A string. - - Public Function Extract(this As String, predicate As Func(Of Char, Boolean)) As String - Return New String(this.ToCharArray().Where(predicate).ToArray()) - End Function + On Error Resume Next - - Public Function ExtractFirstChar(ByRef InputStr As String) As String + 'A \consonant\ in a word is a letter other than A, E, I, O or U, and other + 'than Y preceded by a consonant. (The fact that the term `consonant' is + 'defined to some extent in terms of itself does not make it ambiguous.) So in + 'TOY the consonants are T and Y, and in SYZYGY they are S, Z and G. If a + 'letter is not a consonant it is a \vowel\. - ExtractFirstChar = Left(InputStr, 1) - End Function + 'declaring local variables + Dim chars() As Byte + Dim const_vowel As String + Dim i As Byte + Dim m As Byte + Dim flag As Boolean + Dim pattern As String - - Public Function ExtractFirstWord(ByRef Statement As String) As String - Dim StrArr() As String = Split(Statement, " ") - Return StrArr(0) - End Function + 'initializing + const_vowel = "" + m = 0 + flag = False - - Public Function ExtractLastChar(ByRef InputStr As String) As String + If Not Len(str) = 0 Then - ExtractLastChar = Right(InputStr, 1) - End Function + 'find out the CVC pattern + pattern = returnCVCpattern(str) - ''' - ''' Returns The last word in String - ''' NOTE: String ois converted to Array then the last element is extracted Count-1 - ''' - ''' - ''' String - - Public Function ExtractLastWord(ByRef InputStr As String) As String - Dim TempArr() As String = Split(InputStr, " ") - Dim Count As Integer = TempArr.Count - 1 - Return TempArr(Count) - End Function + 'converting const_vowel to byte array + chars = System.Text.Encoding.Unicode.GetBytes(pattern) - ''' - ''' A string extension method that extracts the letter described by @this. - ''' - ''' The @this to act on. - ''' The extracted letter. - - Public Function ExtractLetter(this As String) As String - Return New String(this.ToCharArray().Where(Function(x) [Char].IsLetter(x)).ToArray()) - End Function + 'counting the number of m's... + For i = 0 To UBound(chars) Step 1 + If Chr(chars(i)) = "v" Or flag = True Then + flag = True + If Chr(chars(i)) = "c" Then + m = m + 1 + flag = False + End If + End If + Next i - ''' - ''' A string extension method that extracts the number described by @this. - ''' - ''' The @this to act on. - ''' The extracted number. - - Public Function ExtractNumber(this As String) As String - Return New String(this.ToCharArray().Where(Function(x) [Char].IsNumber(x)).ToArray()) - End Function + End If - ''' - ''' extracts string between defined strings - ''' - ''' base sgtring - ''' Start string - ''' End string - ''' - - Public Function ExtractStringBetween(ByVal value As String, ByVal strStart As String, ByVal strEnd As String) As String - If Not String.IsNullOrEmpty(value) Then - Dim i As Integer = value.IndexOf(strStart) - Dim j As Integer = value.IndexOf(strEnd) - Return value.Substring(i, j - i) - Else - Return value - End If - End Function + porterCountm = m + + End Function + + Private Shared Function porterEndsCVC(str As String) As Boolean + + On Error Resume Next + + '*o - the stem ends cvc, where the second c is not W, X or Y (e.g. -WIL, -HOP). + + 'declaring local variables + Dim chars() As Byte + Dim const_vowel As String + Dim i As Byte + Dim pattern As String - ''' - ''' Extract words Either side of Divider - ''' - ''' - ''' - ''' Front = F Back =B - ''' - - Public Function ExtractWordsEitherSide(ByRef TextStr As String, ByRef Divider As String, ByRef Mode As String) As String - ExtractWordsEitherSide = "" - Select Case Mode - Case "F" - Return ExtractWordsEitherSide(TextStr, Divider, "F") - Case "B" - Return ExtractWordsEitherSide(TextStr, Divider, "B") - End Select + 'check to see if atleast 3 characters are present + If Len(str) >= 3 Then - End Function + 'converting string to byte array - ' Generate a random number based on the upper and lower bounds of the array, - 'then use that to return the item. - - Public Function FetchRandomItem(Of t)(ByRef theArray() As t) As t + chars = System.Text.Encoding.Unicode.GetBytes(str) - Dim randNumberGenerator As New Random - Randomize() - Dim index As Integer = randNumberGenerator.Next(theArray.GetLowerBound(0), - theArray.GetUpperBound(0) + 1) + 'find out the CVC pattern + pattern = returnCVCpattern(str) - Return theArray(index) + 'we need to check only the last three characters + pattern = Right(pattern, 3) - End Function + 'check to see if the letters in str match the sequence cvc + porterEndsCVC = If(pattern = "cvc", If(Not (Chr(chars(UBound(chars))) = "w" Or Chr(chars(UBound(chars))) = "x" Or Chr(chars(UBound(chars))) = "y"), True, False), False) + Else - ''' - ''' Define the search terms. This list could also be dynamically populated at runtime Find - ''' sentences that contain all the terms in the wordsToMatch array Note that the number of - ''' terms to match is not specified at compile time - ''' - ''' String to be searched - ''' List of Words to be detected - ''' Sentences containing words - - Public Function FindSentencesContaining(ByRef TextStr1 As String, ByRef Words As List(Of String)) As List(Of String) - ' Split the text block into an array of sentences. - Dim sentences As String() = TextStr1.Split(New Char() {".", "?", "!"}) - - Dim wordsToMatch(Words.Count) As String - Dim I As Integer = 0 - For Each item In Words - wordsToMatch(I) = item - I += 1 - Next + porterEndsCVC = False - Dim sentenceQuery = From sentence In sentences - Let w = sentence.Split(New Char() {" ", ",", ".", ";", ":"}, - StringSplitOptions.RemoveEmptyEntries) - Where w.Distinct().Intersect(wordsToMatch).Count = wordsToMatch.Count() - Select sentence + End If - ' Execute the query + End Function - Dim StrList As New List(Of String) - For Each str As String In sentenceQuery - StrList.Add(str) - Next - Return StrList - End Function + Private Shared Function porterEndsDoubleConsonent(str As String) As Boolean - - Public Function FormatJsonOutput(ByVal jsonString As String) As String - Dim stringBuilder = New StringBuilder() - Dim escaping As Boolean = False - Dim inQuotes As Boolean = False - Dim indentation As Integer = 0 + On Error Resume Next - For Each character As Char In jsonString + 'checking whether word ends with a double consonant (e.g. -TT, -SS). - If escaping Then - escaping = False - stringBuilder.Append(character) - Else + 'declaring local variables + Dim holds_ends As String + Dim hold_third_last As String + Dim chars() As Byte - If character = "\"c Then - escaping = True - stringBuilder.Append(character) - ElseIf character = """"c Then - inQuotes = Not inQuotes - stringBuilder.Append(character) - ElseIf Not inQuotes Then + 'first check whether the size of the word is >= 2 + If Len(str) >= 2 Then - If character = ","c Then - stringBuilder.Append(character) - stringBuilder.Append(vbCrLf) - stringBuilder.Append(vbTab, indentation) - ElseIf character = "["c OrElse character = "{"c Then - stringBuilder.Append(character) - stringBuilder.Append(vbCrLf) - stringBuilder.Append(vbTab, System.Threading.Interlocked.Increment(indentation)) - ElseIf character = "]"c OrElse character = "}"c Then - stringBuilder.Append(vbCrLf) - stringBuilder.Append(vbTab, System.Threading.Interlocked.Decrement(indentation)) - stringBuilder.Append(character) - ElseIf character = ":"c Then - stringBuilder.Append(character) - stringBuilder.Append(vbTab) - ElseIf Not Char.IsWhiteSpace(character) Then - stringBuilder.Append(character) - End If - Else - stringBuilder.Append(character) - End If - End If - Next + 'extract 2 characters from right of str + holds_ends = Right(str, 2) - Return stringBuilder.ToString() - End Function + 'converting string to byte array + chars = System.Text.Encoding.Unicode.GetBytes(holds_ends) - - Public Function FormatText(ByRef Text As String) As String - Dim FormatTextResponse As String = "" - 'FORMAT USERINPUT - 'turn to uppercase for searching the db - Text = LTrim(Text) - Text = RTrim(Text) - Text = UCase(Text) - - FormatTextResponse = Text - Return FormatTextResponse - End Function + 'checking if both the characters are same + If chars(0) = chars(1) Then - ''' - ''' Gets the string after the given string parameter. - ''' - ''' The default value. - ''' The given string parameter. - ''' - ''' Unlike GetBefore, this method trims the result - - Public Function GetAfter(value As String, x As String) As String - Dim xPos = value.LastIndexOf(x, StringComparison.Ordinal) - If xPos = -1 Then - Return [String].Empty - End If - Dim startIndex = xPos + x.Length - Return If(startIndex >= value.Length, [String].Empty, value.Substring(startIndex).Trim()) - End Function + 'check for double consonent + If holds_ends = "aa" Or holds_ends = "ee" Or holds_ends = "ii" Or holds_ends = "oo" Or holds_ends = "uu" Then - ''' - ''' Gets the string before the given string parameter. - ''' - ''' The default value. - ''' The given string parameter. - ''' - ''' Unlike GetBetween and GetAfter, this does not Trim the result. - - Public Function GetBefore(value As String, x As String) As String - Dim xPos = value.IndexOf(x, StringComparison.Ordinal) - Return If(xPos = -1, [String].Empty, value.Substring(0, xPos)) - End Function + porterEndsDoubleConsonent = False + Else - ''' - ''' Gets the string between the given string parameters. - ''' - ''' The source value. - ''' The left string sentinel. - ''' The right string sentinel - ''' - ''' Unlike GetBefore, this method trims the result - - Public Function GetBetween(value As String, x As String, y As String) As String - Dim xPos = value.IndexOf(x, StringComparison.Ordinal) - Dim yPos = value.LastIndexOf(y, StringComparison.Ordinal) - If xPos = -1 OrElse xPos = -1 Then - Return [String].Empty - End If - Dim startIndex = xPos + x.Length - Return If(startIndex >= yPos, [String].Empty, value.Substring(startIndex, yPos - startIndex).Trim()) - End Function + 'if the second last character is y, and there are atleast three letters in str + If holds_ends = "yy" And Len(str) > 2 Then - ''' - ''' Returns the first Word - ''' - ''' - ''' - - Public Function GetPrefix(ByRef Statement As String) As String - Dim StrArr() As String = Split(Statement, " ") - Return StrArr(0) - End Function + 'extracting the third last character + hold_third_last = Right(str, 3) + hold_third_last = Left(str, 1) - - Public Function GetRandItemfromList(ByRef li As List(Of String)) As String - Randomize() - Return li.Item(Int(Rnd() * (li.Count - 1))) - End Function + porterEndsDoubleConsonent = If(Not (hold_third_last = "a" Or hold_third_last = "e" Or hold_third_last = "i" Or hold_third_last = "o" Or hold_third_last = "u"), False, True) + Else - ''' - ''' Returns random character from string given length of the string to choose from - ''' - ''' - ''' - ''' - - Public Function GetRndChar(ByVal Source As String, ByVal Length As Integer) As String - Dim rnd As New Random - If Source Is Nothing Then Throw New ArgumentNullException(NameOf(Source), "Must contain a string,") - If Length <= 0 Then Throw New ArgumentException("Length must be a least one.", NameOf(Length)) - Dim s As String = "" - Dim builder As New System.Text.StringBuilder() - builder.Append(s) - For i = 1 To Length - builder.Append(Source(rnd.Next(0, Source.Length))) - Next - s = builder.ToString() - Return s - End Function + porterEndsDoubleConsonent = True - ''' - ''' Returns from index to end of file - ''' - ''' String - ''' Index - ''' - - Public Function GetSlice(ByRef Str As String, ByRef indx As Integer) As String - If indx <= Str.Length Then - Str.Substring(indx, Str.Length) - Return Str(indx) - Else - End If - Return Nothing - End Function + End If - ''' - ''' gets the last word - ''' - ''' - ''' - - Public Function GetSuffix(ByRef InputStr As String) As String - Dim TempArr() As String = Split(InputStr, " ") - Dim Count As Integer = TempArr.Count - 1 - Return TempArr(Count) - End Function + End If + Else - - Public Function GetWordsBetween(ByRef InputStr As String, ByRef StartStr As String, ByRef StopStr As String) - Return InputStr.ExtractStringBetween(StartStr, StopStr) - End Function + porterEndsDoubleConsonent = False - ''' - ''' A string extension method that query if '@this' satisfy the specified pattern. - ''' - ''' The @this to act on. - ''' The pattern to use. Use '*' as wildcard string. - ''' true if '@this' satisfy the specified pattern, false if not. - - Public Function IsLike(this As String, pattern As String) As Boolean - ' Turn the pattern into regex pattern, and match the whole string with ^$ - Dim regexPattern As String = "^" + Regex.Escape(pattern) + "$" - - ' Escape special character ?, #, *, [], and [!] - regexPattern = regexPattern.Replace("\[!", "[^").Replace("\[", "[").Replace("\]", "]").Replace("\?", ".").Replace("\*", ".*").Replace("\#", "\d") - - Return Regex.IsMatch(this, regexPattern) - End Function + End If + Else - ''' - ''' Checks if string is a reserved VBscipt Keyword - ''' - ''' - ''' - - Function IsReservedWord(ByVal keyword As String) As Boolean - Dim IsReserved = False - Select Case LCase(keyword) - Case "and" : IsReserved = True - Case "as" : IsReserved = True - Case "boolean" : IsReserved = True - Case "byref" : IsReserved = True - Case "byte" : IsReserved = True - Case "byval" : IsReserved = True - Case "call" : IsReserved = True - Case "case" : IsReserved = True - Case "class" : IsReserved = True - Case "const" : IsReserved = True - Case "currency" : IsReserved = True - Case "debug" : IsReserved = True - Case "dim" : IsReserved = True - Case "do" : IsReserved = True - Case "double" : IsReserved = True - Case "each" : IsReserved = True - Case "else" : IsReserved = True - Case "elseif" : IsReserved = True - Case "empty" : IsReserved = True - Case "end" : IsReserved = True - Case "endif" : IsReserved = True - Case "enum" : IsReserved = True - Case "eqv" : IsReserved = True - Case "event" : IsReserved = True - Case "exit" : IsReserved = True - Case "false" : IsReserved = True - Case "for" : IsReserved = True - Case "function" : IsReserved = True - Case "get" : IsReserved = True - Case "goto" : IsReserved = True - Case "if" : IsReserved = True - Case "imp" : IsReserved = True - Case "implements" : IsReserved = True - Case "in" : IsReserved = True - Case "integer" : IsReserved = True - Case "is" : IsReserved = True - Case "let" : IsReserved = True - Case "like" : IsReserved = True - Case "long" : IsReserved = True - Case "loop" : IsReserved = True - Case "lset" : IsReserved = True - Case "me" : IsReserved = True - Case "mod" : IsReserved = True - Case "new" : IsReserved = True - Case "next" : IsReserved = True - Case "not" : IsReserved = True - Case "nothing" : IsReserved = True - Case "null" : IsReserved = True - Case "on" : IsReserved = True - Case "option" : IsReserved = True - Case "optional" : IsReserved = True - Case "or" : IsReserved = True - Case "paramarray" : IsReserved = True - Case "preserve" : IsReserved = True - Case "private" : IsReserved = True - Case "public" : IsReserved = True - Case "raiseevent" : IsReserved = True - Case "redim" : IsReserved = True - Case "rem" : IsReserved = True - Case "resume" : IsReserved = True - Case "rset" : IsReserved = True - Case "select" : IsReserved = True - Case "set" : IsReserved = True - Case "shared" : IsReserved = True - Case "single" : IsReserved = True - Case "static" : IsReserved = True - Case "stop" : IsReserved = True - Case "sub" : IsReserved = True - Case "then" : IsReserved = True - Case "to" : IsReserved = True - Case "true" : IsReserved = True - Case "type" : IsReserved = True - Case "typeof" : IsReserved = True - Case "until" : IsReserved = True - Case "variant" : IsReserved = True - Case "wend" : IsReserved = True - Case "while" : IsReserved = True - Case "with" : IsReserved = True - Case "xor" : IsReserved = True - End Select - Return IsReserved - End Function + porterEndsDoubleConsonent = False - ''' - ''' Returns Propercase Sentence - ''' - ''' String to be formatted - ''' - - Public Function ProperCase(ByRef TheString As String) As String - ProperCase = UCase(Left(TheString, 1)) + End If - For i = 2 To Len(TheString) + End Function - ProperCase = If(Mid(TheString, i - 1, 1) = " ", ProperCase & UCase(Mid(TheString, i, 1)), ProperCase & LCase(Mid(TheString, i, 1))) - Next i - End Function + Private Shared Function porterEndsWith(str As String, ends As String) As Boolean - - Public Function RemoveBrackets(ByRef Txt As String) As String - 'Brackets - Txt = Txt.Replace("(", "") - Txt = Txt.Replace("{", "") - Txt = Txt.Replace("}", "") - Txt = Txt.Replace("[", "") - Txt = Txt.Replace("]", "") - Return Txt - End Function + On Error Resume Next - - Public Function RemoveFullStop(ByRef MESSAGE As String) As String -Loop1: - If Right(MESSAGE, 1) = "." Then MESSAGE = Left(MESSAGE, Len(MESSAGE) - 1) : GoTo Loop1 - Return MESSAGE - End Function + 'declaring local variables + Dim length_str As Byte + Dim length_ends As Byte + Dim hold_ends As String - ''' - ''' A string extension method that removes the letter described by @this. - ''' - ''' The @this to act on. - ''' A string. - - Public Function RemoveLetter(this As String) As String - Return New String(this.ToCharArray().Where(Function(x) Not [Char].IsLetter(x)).ToArray()) - End Function + 'finding the length of the string + length_str = Len(str) + length_ends = Len(ends) - - Public Function RemoveMathsSymbols(ByRef Txt As String) As String - 'Maths Symbols - Txt = Txt.Replace("+", "") - Txt = Txt.Replace("=", "") - Txt = Txt.Replace("-", "") - Txt = Txt.Replace("/", "") - Txt = Txt.Replace("*", "") - Txt = Txt.Replace("<", "") - Txt = Txt.Replace(">", "") - Txt = Txt.Replace("%", "") - Return Txt - End Function + 'if length of str is greater than the length of length_ends, only then proceed..else return false + If length_ends >= length_str Then - ''' - ''' A string extension method that removes the number described by @this. - ''' - ''' The @this to act on. - ''' A string. - - Public Function RemoveNumber(this As String) As String - Return New String(this.ToCharArray().Where(Function(x) Not [Char].IsNumber(x)).ToArray()) - End Function + porterEndsWith = False + Else - - Public Function RemovePunctuation(ByRef Txt As String) As String - 'Punctuation - Txt = Txt.Replace(",", "") - Txt = Txt.Replace(".", "") - Txt = Txt.Replace(";", "") - Txt = Txt.Replace("'", "") - Txt = Txt.Replace("_", "") - Txt = Txt.Replace("?", "") - Txt = Txt.Replace("!", "") - Txt = Txt.Replace("&", "") - Txt = Txt.Replace(":", "") - - Return Txt - End Function + 'extract characters from right of str + hold_ends = Right(str, length_ends) - ''' - ''' Removes StopWords from sentence - ''' ARAB/ENG/DUTCH/FRENCH/SPANISH/ITALIAN - ''' Hopefully leaving just relevant words in the user sentence - ''' Currently Under Revision (takes too many words) - ''' - ''' - ''' - - Public Function RemoveStopWords(ByRef Userinput As String) As String - ' Userinput = LCase(Userinput).Replace("the", "r") - For Each item In StopWordsENG - Userinput = LCase(Userinput).Replace(item, "") - Next - For Each item In StopWordsArab - Userinput = Userinput.Replace(item, "") - Next - For Each item In StopWordsDutch - Userinput = Userinput.Replace(item, "") - Next - For Each item In StopWordsFrench - Userinput = Userinput.Replace(item, "") - Next - For Each item In StopWordsItalian - Userinput = Userinput.Replace(item, "") - Next - For Each item In StopWordsSpanish - Userinput = Userinput.Replace(item, "") - Next - Return Userinput - End Function + 'comparing to see whether hold_ends=ends + porterEndsWith = If(StrComp(hold_ends, ends) = 0, True, False) - - Public Function RemoveStopWords(ByRef txt As String, ByRef StopWrds As List(Of String)) As String - For Each item In StopWrds - txt = txt.Replace(item, "") - Next - Return txt - End Function + End If - - Public Function RemoveSymbols(ByRef Txt As String) As String - 'Basic Symbols - Txt = Txt.Replace("£", "") - Txt = Txt.Replace("$", "") - Txt = Txt.Replace("^", "") - Txt = Txt.Replace("@", "") - Txt = Txt.Replace("#", "") - Txt = Txt.Replace("~", "") - Txt = Txt.Replace("\", "") - Return Txt - End Function + End Function - ''' - ''' A string extension method that removes the letter. - ''' - ''' The @this to act on. - ''' The predicate. - ''' A string. - - Public Function RemoveWhere(this As String, predicate As Func(Of Char, Boolean)) As String - Return New String(this.ToCharArray().Where(Function(x) Not predicate(x)).ToArray()) - End Function + Private Shared Function porterTrimEnd(str As String, length As Byte) As String - ''' - ''' Advanced search String pattern Wildcard denotes which position 1st =1 or 2nd =2 Send - ''' Original input > Search pattern to be used > Wildcard requred SPattern = "WHAT - ''' COLOUR DO YOU LIKE * OR *" Textstr = "WHAT COLOUR DO YOU LIKE red OR black" ITEM_FOUND = - ''' = SearchPattern(USERINPUT, SPattern, 1) ---- RETURNS RED ITEM_FOUND = = - ''' SearchPattern(USERINPUT, SPattern, 1) ---- RETURNS black - ''' - ''' - ''' TextStr Required. String.EG: "WHAT COLOUR DO YOU LIKE red OR black" - ''' - ''' - ''' SPattern Required. String.EG: "WHAT COLOUR DO YOU LIKE * OR *" - ''' - ''' Wildcard Required. Integer.EG: 1st =1 or 2nd =2 - ''' - ''' * in search pattern - - Public Function SearchPattern(ByRef TextSTR As String, ByRef SPattern As String, ByRef Wildcard As Short) As String - Dim SearchP2 As String - Dim SearchP1 As String - Dim TextStrp3 As String - Dim TextStrp4 As String - SearchPattern = "" - SearchP2 = "" - SearchP1 = "" - TextStrp3 = "" - TextStrp4 = "" - If TextSTR Like SPattern = True Then - Select Case Wildcard - Case 1 - Call SplitPhrase(SPattern, "*", SearchP1, SearchP2) - TextSTR = Replace(TextSTR, SearchP1, "", 1, -1, CompareMethod.Text) - - SearchP2 = Replace(SearchP2, "*", "", 1, -1, CompareMethod.Text) - Call SplitPhrase(TextSTR, SearchP2, TextStrp3, TextStrp4) - - TextSTR = TextStrp3 - - Case 2 - Call SplitPhrase(SPattern, "*", SearchP1, SearchP2) - SPattern = Replace(SPattern, SearchP1, " ", 1, -1, CompareMethod.Text) - TextSTR = Replace(TextSTR, SearchP1, " ", 1, -1, CompareMethod.Text) - - Call SplitPhrase(SearchP2, "*", SearchP1, SearchP2) - Call SplitPhrase(TextSTR, SearchP1, TextStrp3, TextStrp4) - - TextSTR = TextStrp4 + On Error Resume Next - End Select + 'returning the trimmed string + porterTrimEnd = Left(str, Len(str) - length) - SearchPattern = TextSTR - LTrim(SearchPattern) - RTrim(SearchPattern) - Else - End If + End Function - End Function + Private Shared Function returnCVCpattern(str As String) As String - ''' - ''' Advanced search String pattern Wildcard denotes which position 1st =1 or 2nd =2 Send - ''' Original input > Search pattern to be used > Wildcard requred SPattern = "WHAT - ''' COLOUR DO YOU LIKE * OR *" Textstr = "WHAT COLOUR DO YOU LIKE red OR black" ITEM_FOUND = - ''' = SearchPattern(USERINPUT, SPattern, 1) ---- RETURNS RED ITEM_FOUND = = - ''' SearchPattern(USERINPUT, SPattern, 2) ---- RETURNS black - ''' - ''' TextStr = "Pick Red OR Blue" . String. - ''' Search String = ("Pick * OR *") String. - ''' Wildcard Required. Integer. = 1= Red / 2= Blue - ''' - ''' finds the * in search pattern - - Public Function SearchStringbyPattern(ByRef TextSTR As String, ByRef SPattern As String, ByRef Wildcard As Short) As String - Dim SearchP2 As String - Dim SearchP1 As String - Dim TextStrp3 As String - Dim TextStrp4 As String - SearchStringbyPattern = "" - SearchP2 = "" - SearchP1 = "" - TextStrp3 = "" - TextStrp4 = "" - If TextSTR Like SPattern = True Then - Select Case Wildcard - Case 1 - Call SplitString(SPattern, "*", SearchP1, SearchP2) - TextSTR = Replace(TextSTR, SearchP1, "", 1, -1, CompareMethod.Text) - - SearchP2 = Replace(SearchP2, "*", "", 1, -1, CompareMethod.Text) - Call SplitString(TextSTR, SearchP2, TextStrp3, TextStrp4) - - TextSTR = TextStrp3 - - Case 2 - Call SplitString(SPattern, "*", SearchP1, SearchP2) - SPattern = Replace(SPattern, SearchP1, " ", 1, -1, CompareMethod.Text) - TextSTR = Replace(TextSTR, SearchP1, " ", 1, -1, CompareMethod.Text) - - Call SplitString(SearchP2, "*", SearchP1, SearchP2) - Call SplitString(TextSTR, SearchP1, TextStrp3, TextStrp4) - - TextSTR = TextStrp4 + 'local variables + Dim chars() As Byte + Dim const_vowel As String = "" + Dim i As Byte - End Select + 'converting string to byte array + chars = System.Text.Encoding.Unicode.GetBytes(str) - SearchStringbyPattern = TextSTR - LTrim(SearchStringbyPattern) - RTrim(SearchStringbyPattern) - Else - End If + 'checking each character to see if it is a consonent or a vowel. also inputs the information in const_vowel + For i = 0 To UBound(chars) Step 1 - End Function + If Chr(chars(i)) = "a" Or Chr(chars(i)) = "e" Or Chr(chars(i)) = "i" Or Chr(chars(i)) = "o" Or Chr(chars(i)) = "u" Then + const_vowel = const_vowel + "v" + ElseIf Chr(chars(i)) = "y" Then + 'if y is not the first character, only then check the previous character + 'check to see if previous character is a consonent + const_vowel = If(i > 0, If(Not (Chr(chars(i - 1)) = "a" Or Chr(chars(i - 1)) = "e" Or Chr(chars(i - 1)) = "i" Or Chr(chars(i - 1)) = "o" Or Chr(chars(i - 1)) = "u"), const_vowel + "v", const_vowel + "c"), const_vowel + "c") + Else + const_vowel = const_vowel + "c" + End If - - Public Function SpaceItems(ByRef txt As String, Item As String) As String - Return txt.Replace(Item, " " & Item & " ") - End Function + Next i - - Public Function SpacePunctuation(ByRef Txt As String) As String - For Each item In Symbols - Txt = SpaceItems(Txt, item) - Next - For Each item In EncapuslationPunctuationEnd - Txt = SpaceItems(Txt, item) - Next - For Each item In EncapuslationPunctuationStart - Txt = SpaceItems(Txt, item) - Next - For Each item In GramaticalPunctuation - Txt = SpaceItems(Txt, item) - Next - For Each item In MathPunctuation - Txt = SpaceItems(Txt, item) - Next - For Each item In MoneyPunctuation - Txt = SpaceItems(Txt, item) - Next - Return Txt - End Function + returnCVCpattern = const_vowel - ''' - ''' SPLITS THE GIVEN PHRASE UP INTO TWO PARTS by dividing word SplitPhrase(Userinput, "and", - ''' Firstp, SecondP) - ''' - ''' Sentence to be divided - ''' String: Word to divide sentence by - ''' String: firstpart of sentence to be populated - ''' String: Secondpart of sentence to be populated - ''' - - Public Sub SplitPhrase(ByVal PHRASE As String, ByRef DIVIDINGWORD As String, ByRef FIRSTPART As String, ByRef SECONDPART As String) - Dim POS As Short - POS = InStr(PHRASE, DIVIDINGWORD) - If (POS > 0) Then - FIRSTPART = Trim(Left(PHRASE, POS - 1)) - SECONDPART = Trim(Right(PHRASE, Len(PHRASE) - POS - Len(DIVIDINGWORD) + 1)) - Else - FIRSTPART = "" - SECONDPART = PHRASE - End If - End Sub + End Function + End Class ''' - ''' SPLITS THE GIVEN PHRASE UP INTO TWO PARTS by dividing word SplitPhrase(Userinput, "and", - ''' Firstp, SecondP) + ''' The removal of commonly used words which are only used to create a sentence such as, + ''' the, on, in of, but ''' - ''' String: Sentence to be divided - ''' String: Word to divide sentence by - ''' String-Returned : firstpart of sentence to be populated - ''' String-Returned : Secondpart of sentence to be populated - ''' - - Public Sub SplitString(ByVal PHRASE As String, ByRef DIVIDINGWORD As String, ByRef FIRSTPART As String, ByRef SECONDPART As String) - Dim POS As Short - 'Check Error - If DIVIDINGWORD IsNot Nothing And PHRASE IsNot Nothing Then + Public Class RemoveStopWords - POS = InStr(PHRASE, DIVIDINGWORD) - If (POS > 0) Then - FIRSTPART = Trim(Left(PHRASE, POS - 1)) - SECONDPART = Trim(Right(PHRASE, Len(PHRASE) - POS - Len(DIVIDINGWORD) + 1)) - Else - FIRSTPART = "" - SECONDPART = PHRASE - End If - Else + Public StopWords As New List(Of String) - End If - End Sub + Public StopWordsArab() As String = {"،", "آض", "آمينَ", "آه", + "آهاً", "آي", "أ", "أب", "أجل", "أجمع", "أخ", "أخذ", "أصبح", "أضحى", "أقبل", + "أقل", "أكثر", "ألا", "أم", "أما", "أمامك", "أمامكَ", "أمسى", "أمّا", "أن", "أنا", "أنت", + "أنتم", "أنتما", "أنتن", "أنتِ", "أنشأ", "أنّى", "أو", "أوشك", "أولئك", "أولئكم", "أولاء", + "أولالك", "أوّهْ", "أي", "أيا", "أين", "أينما", "أيّ", "أَنَّ", "أََيُّ", "أُفٍّ", "إذ", "إذا", "إذاً", + "إذما", "إذن", "إلى", "إليكم", "إليكما", "إليكنّ", "إليكَ", "إلَيْكَ", "إلّا", "إمّا", "إن", "إنّما", + "إي", "إياك", "إياكم", "إياكما", "إياكن", "إيانا", "إياه", "إياها", "إياهم", "إياهما", "إياهن", + "إياي", "إيهٍ", "إِنَّ", "ا", "ابتدأ", "اثر", "اجل", "احد", "اخرى", "اخلولق", "اذا", "اربعة", "ارتدّ", + "استحال", "اطار", "اعادة", "اعلنت", "اف", "اكثر", "اكد", "الألاء", "الألى", "الا", "الاخيرة", "الان", "الاول", + "الاولى", "التى", "التي", "الثاني", "الثانية", "الذاتي", "الذى", "الذي", "الذين", "السابق", "الف", "اللائي", + "اللاتي", "اللتان", "اللتيا", "اللتين", "اللذان", "اللذين", "اللواتي", "الماضي", "المقبل", "الوقت", "الى", + "اليوم", "اما", "امام", "امس", "ان", "انبرى", "انقلب", "انه", "انها", "او", "اول", "اي", "ايار", "ايام", + "ايضا", "ب", "بات", "باسم", "بان", "بخٍ", "برس", "بسبب", "بسّ", "بشكل", "بضع", "بطآن", "بعد", "بعض", "بك", + "بكم", "بكما", "بكن", "بل", "بلى", "بما", "بماذا", "بمن", "بن", "بنا", "به", "بها", "بي", "بيد", "بين", + "بَسْ", "بَلْهَ", "بِئْسَ", "تانِ", "تانِك", "تبدّل", "تجاه", "تحوّل", "تلقاء", "تلك", "تلكم", "تلكما", "تم", "تينك", + "تَيْنِ", "تِه", "تِي", "ثلاثة", "ثم", "ثمّ", "ثمّة", "ثُمَّ", "جعل", "جلل", "جميع", "جير", "حار", "حاشا", "حاليا", + "حاي", "حتى", "حرى", "حسب", "حم", "حوالى", "حول", "حيث", "حيثما", "حين", "حيَّ", "حَبَّذَا", "حَتَّى", "حَذارِ", "خلا", + "خلال", "دون", "دونك", "ذا", "ذات", "ذاك", "ذانك", "ذانِ", "ذلك", "ذلكم", "ذلكما", "ذلكن", "ذو", "ذوا", "ذواتا", "ذواتي", "ذيت", "ذينك", "ذَيْنِ", "ذِه", "ذِي", "راح", "رجع", "رويدك", "ريث", "رُبَّ", "زيارة", "سبحان", "سرعان", "سنة", "سنوات", "سوف", "سوى", "سَاءَ", "سَاءَمَا", "شبه", "شخصا", "شرع", "شَتَّانَ", "صار", "صباح", "صفر", "صهٍ", "صهْ", "ضد", "ضمن", "طاق", "طالما", "طفق", "طَق", "ظلّ", "عاد", "عام", "عاما", "عامة", "عدا", "عدة", "عدد", "عدم", "عسى", "عشر", "عشرة", "علق", "على", "عليك", "عليه", "عليها", "علًّ", "عن", "عند", "عندما", "عوض", "عين", "عَدَسْ", "عَمَّا", "غدا", "غير", "ـ", "ف", "فان", "فلان", "فو", "فى", "في", "فيم", "فيما", "فيه", "فيها", "قال", "قام", "قبل", "قد", "قطّ", "قلما", "قوة", "كأنّما", "كأين", "كأيّ", "كأيّن", "كاد", "كان", "كانت", "كذا", "كذلك", "كرب", "كل", "كلا", "كلاهما", "كلتا", "كلم", "كليكما", "كليهما", "كلّما", "كلَّا", "كم", "كما", "كي", "كيت", "كيف", "كيفما", "كَأَنَّ", "كِخ", "لئن", "لا", "لات", "لاسيما", "لدن", "لدى", "لعمر", "لقاء", "لك", "لكم", "لكما", "لكن", "لكنَّما", "لكي", "لكيلا", "للامم", "لم", "لما", "لمّا", "لن", "لنا", "له", "لها", "لو", "لوكالة", "لولا", "لوما", "لي", "لَسْتَ", "لَسْتُ", "لَسْتُم", "لَسْتُمَا", "لَسْتُنَّ", "لَسْتِ", "لَسْنَ", "لَعَلَّ", "لَكِنَّ", "لَيْتَ", "لَيْسَ", "لَيْسَا", "لَيْسَتَا", "لَيْسَتْ", "لَيْسُوا", "لَِسْنَا", "ما", "ماانفك", "مابرح", "مادام", "ماذا", "مازال", "مافتئ", "مايو", "متى", "مثل", "مذ", "مساء", "مع", "معاذ", "مقابل", "مكانكم", "مكانكما", "مكانكنّ", "مكانَك", "مليار", "مليون", "مما", "ممن", "من", "منذ", "منها", "مه", "مهما", "مَنْ", "مِن", "نحن", "نحو", "نعم", "نفس", "نفسه", "نهاية", "نَخْ", "نِعِمّا", "نِعْمَ", "ها", "هاؤم", "هاكَ", "هاهنا", "هبّ", "هذا", "هذه", "هكذا", "هل", "هلمَّ", "هلّا", "هم", "هما", "هن", "هنا", "هناك", "هنالك", "هو", "هي", "هيا", "هيت", "هيّا", "هَؤلاء", "هَاتانِ", "هَاتَيْنِ", "هَاتِه", "هَاتِي", "هَجْ", "هَذا", "هَذانِ", "هَذَيْنِ", "هَذِه", "هَذِي", "هَيْهَاتَ", "و", "و6", "وا", "واحد", "واضاف", "واضافت", "واكد", "وان", "واهاً", "واوضح", "وراءَك", "وفي", "وقال", "وقالت", "وقد", "وقف", "وكان", "وكانت", "ولا", "ولم", + "ومن", "وهو", "وهي", "ويكأنّ", "وَيْ", "وُشْكَانََ", "يكون", "يمكن", "يوم", "ّأيّان"} - ''' - ''' Split string to List of strings - ''' - ''' base string - ''' to be seperated by - ''' - - Public Function SplitToList(ByRef Str As String, ByVal Seperator As String) As List(Of String) - Dim lst As New List(Of String) - If Str <> "" = True And Seperator <> "" Then - - Dim Found() As String = Str.Split(Seperator) - For Each item In Found - lst.Add(item) - Next - Else + Public StopWordsDutch() As String = {"aan", "achte", "achter", "af", "al", "alle", "alleen", "alles", "als", "ander", "anders", "beetje", + "behalve", "beide", "beiden", "ben", "beneden", "bent", "bij", "bijna", "bijv", "blijkbaar", "blijken", "boven", "bv", + "daar", "daardoor", "daarin", "daarna", "daarom", "daaruit", "dan", "dat", "de", "deden", "deed", "derde", "derhalve", "dertig", + "deze", "dhr", "die", "dit", "doe", "doen", "doet", "door", "drie", "duizend", "echter", "een", "eens", "eerst", "eerste", "eigen", + "eigenlijk", "elk", "elke", "en", "enige", "er", "erg", "ergens", "etc", "etcetera", "even", "geen", "genoeg", "geweest", "haar", + "haarzelf", "had", "hadden", "heb", "hebben", "hebt", "hedden", "heeft", "heel", "hem", "hemzelf", "hen", "het", "hetzelfde", + "hier", "hierin", "hierna", "hierom", "hij", "hijzelf", "hoe", "honderd", "hun", "ieder", "iedere", "iedereen", "iemand", "iets", + "ik", "in", "inderdaad", "intussen", "is", "ja", "je", "jij", "jijzelf", "jou", "jouw", "jullie", "kan", "kon", "konden", "kun", + "kunnen", "kunt", "laatst", "later", "lijken", "lijkt", "maak", "maakt", "maakte", "maakten", "maar", "mag", "maken", "me", "meer", + "meest", "meestal", "men", "met", "mevr", "mij", "mijn", "minder", "miss", "misschien", "missen", "mits", "mocht", "mochten", + "moest", "moesten", "moet", "moeten", "mogen", "mr", "mrs", "mw", "na", "naar", "nam", "namelijk", "nee", "neem", "negen", + "nemen", "nergens", "niemand", "niet", "niets", "niks", "noch", "nochtans", "nog", "nooit", "nu", "nv", "of", "om", "omdat", + "ondanks", "onder", "ondertussen", "ons", "onze", "onzeker", "ooit", "ook", "op", "over", "overal", "overige", "paar", "per", + "recent", "redelijk", "samen", "sinds", "steeds", "te", "tegen", "tegenover", "thans", "tien", "tiende", "tijdens", "tja", "toch", + "toe", "tot", "totdat", "tussen", "twee", "tweede", "u", "uit", "uw", "vaak", "van", "vanaf", "veel", "veertig", "verder", + "verscheidene", "verschillende", "via", "vier", "vierde", "vijf", "vijfde", "vijftig", "volgend", "volgens", "voor", "voordat", + "voorts", "waar", "waarom", "waarschijnlijk", "wanneer", "waren", "was", "wat", "we", "wederom", "weer", "weinig", "wel", "welk", + "welke", "werd", "werden", "werder", "whatever", "wie", "wij", "wijzelf", "wil", "wilden", "willen", "word", "worden", "wordt", "zal", + "ze", "zei", "zeker", "zelf", "zelfde", "zes", "zeven", "zich", "zij", "zijn", "zijzelf", "zo", "zoals", "zodat", "zou", "zouden", + "zulk", "zullen"} - End If - Return lst - End Function + Public StopWordsENG() As String = {"a", "as", "able", "about", "above", "according", "accordingly", "across", "actually", "after", "afterwards", "again", "against", "aint", + "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "another", "any", + "anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "arent", "around", + "as", "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "b", "be", "became", "because", "become", "becomes", "becoming", + "been", "before", "beforehand", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "both", "brief", + "but", "by", "c", "cmon", "cs", "came", "can", "cant", "cannot", "cant", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com", + "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldnt", + "course", "currently", "d", "definitely", "described", "despite", "did", "didnt", "different", "do", "does", "doesnt", "doing", "dont", "done", "down", + "downwards", "during", "e", "each", "edu", "eg", "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever", + "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "f", "far", "few", "fifth", "first", "five", "followed", + "following", "follows", "for", "former", "formerly", "forth", "four", "from", "further", "furthermore", "g", "get", "gets", "getting", "given", "gives", + "go", "goes", "going", "gone", "got", "gotten", "greetings", "h", "had", "hadnt", "happens", "hardly", "has", "hasnt", "have", "havent", "having", "he", + "hes", "hello", "help", "hence", "her", "here", "heres", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself", "his", + "hither", "hopefully", "how", "howbeit", "however", "i", "id", "ill", "im", "ive", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc", "indeed", + "indicate", "indicated", "indicates", "inner", "insofar", "instead", "into", "inward", "is", "isnt", "it", "itd", "itll", "its", "its", "itself", "j", + "just", "k", "keep", "keeps", "kept", "know", "known", "knows", "l", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", + "like", "liked", "likely", "little", "look", "looking", "looks", "ltd", "m", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile", "merely", "might", + "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "n", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither", + "never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone", "nor", "normally", "not", "nothing", "novel", "now", "nowhere", "o", + "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", + "ours", "ourselves", "out", "outside", "over", "overall", "own", "p", "particular", "particularly", "per", "perhaps", "placed", "please", "plus", "possible", + "presumably", "probably", "provides", "q", "que", "quite", "qv", "r", "rather", "rd", "re", "really", "reasonably", "regarding", "regardless", "regards", + "relatively", "respectively", "right", "s", "said", "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", + "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "she", "should", "shouldnt", "since", "six", "so", + "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying", + "still", "sub", "such", "sup", "sure", "t", "ts", "take", "taken", "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "thats", "thats", "the", + "their", "theirs", "them", "themselves", "then", "thence", "there", "theres", "thereafter", "thereby", "therefore", "therein", "theres", "thereupon", + "these", "they", "theyd", "theyll", "theyre", "theyve", "think", "third", "this", "thorough", "thoroughly", "those", "though", "three", "through", + "throughout", "thru", "thus", "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "twice", "two", "u", "un", + "under", "unfortunately", "unless", "unlikely", "until", "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using", "usually", "uucp", "v", + "value", "various", "very", "via", "viz", "vs", "w", "want", "wants", "was", "wasnt", "way", "we", "wed", "well", "were", "weve", "welcome", "well", + "went", "were", "werent", "what", "whats", "whatever", "when", "whence", "whenever", "where", "wheres", "whereafter", "whereas", "whereby", "wherein", + "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whos", "whoever", "whole", "whom", "whose", "why", "will", "willing", "wish", + "with", "within", "without", "wont", "wonder", "would", "wouldnt", "x", "y", "yes", "yet", "you", "youd", "youll", "youre", "youve", "your", "yours", + "yourself", "yourselves", "youll", "z", "zero"} - ''' - ''' Returns a delimited string from the list. - ''' - ''' - ''' - ''' - - Public Function ToDelimitedString(ls As List(Of String), delimiter As String) As String - Dim sb As New StringBuilder - For Each buf As String In ls - sb.Append(buf) - sb.Append(delimiter) - Next - Return sb.ToString.Trim(CChar(delimiter)) - End Function + Public StopWordsFrench() As String = {"a", "abord", "absolument", "afin", "ah", "ai", "aie", "ailleurs", "ainsi", "ait", "allaient", "allo", "allons", + "allô", "alors", "anterieur", "anterieure", "anterieures", "apres", "après", "as", "assez", "attendu", "au", "aucun", "aucune", + "aujourd", "aujourd'hui", "aupres", "auquel", "aura", "auraient", "aurait", "auront", "aussi", "autre", "autrefois", "autrement", + "autres", "autrui", "aux", "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avec", "avoir", "avons", "ayant", "b", + "bah", "bas", "basee", "bat", "beau", "beaucoup", "bien", "bigre", "boum", "bravo", "brrr", "c", "car", "ce", "ceci", "cela", "celle", + "celle-ci", "celle-là", "celles", "celles-ci", "celles-là", "celui", "celui-ci", "celui-là", "cent", "cependant", "certain", + "certaine", "certaines", "certains", "certes", "ces", "cet", "cette", "ceux", "ceux-ci", "ceux-là", "chacun", "chacune", "chaque", + "cher", "chers", "chez", "chiche", "chut", "chère", "chères", "ci", "cinq", "cinquantaine", "cinquante", "cinquantième", "cinquième", + "clac", "clic", "combien", "comme", "comment", "comparable", "comparables", "compris", "concernant", "contre", "couic", "crac", "d", + "da", "dans", "de", "debout", "dedans", "dehors", "deja", "delà", "depuis", "dernier", "derniere", "derriere", "derrière", "des", + "desormais", "desquelles", "desquels", "dessous", "dessus", "deux", "deuxième", "deuxièmement", "devant", "devers", "devra", + "different", "differentes", "differents", "différent", "différente", "différentes", "différents", "dire", "directe", "directement", + "dit", "dite", "dits", "divers", "diverse", "diverses", "dix", "dix-huit", "dix-neuf", "dix-sept", "dixième", "doit", "doivent", "donc", + "dont", "douze", "douzième", "dring", "du", "duquel", "durant", "dès", "désormais", "e", "effet", "egale", "egalement", "egales", "eh", + "elle", "elle-même", "elles", "elles-mêmes", "en", "encore", "enfin", "entre", "envers", "environ", "es", "est", "et", "etant", "etc", + "etre", "eu", "euh", "eux", "eux-mêmes", "exactement", "excepté", "extenso", "exterieur", "f", "fais", "faisaient", "faisant", "fait", + "façon", "feront", "fi", "flac", "floc", "font", "g", "gens", "h", "ha", "hein", "hem", "hep", "hi", "ho", "holà", "hop", "hormis", "hors", + "hou", "houp", "hue", "hui", "huit", "huitième", "hum", "hurrah", "hé", "hélas", "i", "il", "ils", "importe", "j", "je", "jusqu", "jusque", + "juste", "k", "l", "la", "laisser", "laquelle", "las", "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "longtemps", + "lors", "lorsque", "lui", "lui-meme", "lui-même", "là", "lès", "m", "ma", "maint", "maintenant", "mais", "malgre", "malgré", "maximale", + "me", "meme", "memes", "merci", "mes", "mien", "mienne", "miennes", "miens", "mille", "mince", "minimale", "moi", "moi-meme", "moi-même", + "moindres", "moins", "mon", "moyennant", "multiple", "multiples", "même", "mêmes", "n", "na", "naturel", "naturelle", "naturelles", "ne", + "neanmoins", "necessaire", "necessairement", "neuf", "neuvième", "ni", "nombreuses", "nombreux", "non", "nos", "notamment", "notre", + "nous", "nous-mêmes", "nouveau", "nul", "néanmoins", "nôtre", "nôtres", "o", "oh", "ohé", "ollé", "olé", "on", "ont", "onze", "onzième", + "ore", "ou", "ouf", "ouias", "oust", "ouste", "outre", "ouvert", "ouverte", "ouverts", "o|", "où", "p", "paf", "pan", "par", "parce", + "parfois", "parle", "parlent", "parler", "parmi", "parseme", "partant", "particulier", "particulière", "particulièrement", "pas", + "passé", "pendant", "pense", "permet", "personne", "peu", "peut", "peuvent", "peux", "pff", "pfft", "pfut", "pif", "pire", "plein", + "plouf", "plus", "plusieurs", "plutôt", "possessif", "possessifs", "possible", "possibles", "pouah", "pour", "pourquoi", "pourrais", + "pourrait", "pouvait", "prealable", "precisement", "premier", "première", "premièrement", "pres", "probable", "probante", + "procedant", "proche", "près", "psitt", "pu", "puis", "puisque", "pur", "pure", "q", "qu", "quand", "quant", "quant-à-soi", "quanta", + "quarante", "quatorze", "quatre", "quatre-vingt", "quatrième", "quatrièmement", "que", "quel", "quelconque", "quelle", "quelles", + "quelqu'un", "quelque", "quelques", "quels", "qui", "quiconque", "quinze", "quoi", "quoique", "r", "rare", "rarement", "rares", + "relative", "relativement", "remarquable", "rend", "rendre", "restant", "reste", "restent", "restrictif", "retour", "revoici", + "revoilà", "rien", "s", "sa", "sacrebleu", "sait", "sans", "sapristi", "sauf", "se", "sein", "seize", "selon", "semblable", "semblaient", + "semble", "semblent", "sent", "sept", "septième", "sera", "seraient", "serait", "seront", "ses", "seul", "seule", "seulement", "si", + "sien", "sienne", "siennes", "siens", "sinon", "six", "sixième", "soi", "soi-même", "soit", "soixante", "son", "sont", "sous", "souvent", + "specifique", "specifiques", "speculatif", "stop", "strictement", "subtiles", "suffisant", "suffisante", "suffit", "suis", "suit", + "suivant", "suivante", "suivantes", "suivants", "suivre", "superpose", "sur", "surtout", "t", "ta", "tac", "tant", "tardive", "te", + "tel", "telle", "tellement", "telles", "tels", "tenant", "tend", "tenir", "tente", "tes", "tic", "tien", "tienne", "tiennes", "tiens", + "toc", "toi", "toi-même", "ton", "touchant", "toujours", "tous", "tout", "toute", "toutefois", "toutes", "treize", "trente", "tres", + "trois", "troisième", "troisièmement", "trop", "très", "tsoin", "tsouin", "tu", "té", "u", "un", "une", "unes", "uniformement", "unique", + "uniques", "uns", "v", "va", "vais", "vas", "vers", "via", "vif", "vifs", "vingt", "vivat", "vive", "vives", "vlan", "voici", "voilà", + "vont", "vos", "votre", "vous", "vous-mêmes", "vu", "vé", "vôtre", "vôtres", "w", "x", "y", "z", "zut", "à", "â", "ça", "ès", "étaient", + "étais", "était", "étant", "été", "être", "ô"} - ''' - ''' Convert object to Json String - ''' - ''' - ''' - - Public Function ToJson(ByRef Item As Object) As String - Dim Converter As New JavaScriptSerializer - Return Converter.Serialize(Item) + Public StopWordsItalian() As String = {"IE", "a", "abbastanza", "abbia", "abbiamo", "abbiano", "abbiate", "accidenti", "ad", "adesso", "affinche", "agl", "agli", + "ahime", "ahimè", "ai", "al", "alcuna", "alcuni", "alcuno", "all", "alla", "alle", "allo", "allora", "altri", "altrimenti", "altro", + "altrove", "altrui", "anche", "ancora", "anni", "anno", "ansa", "anticipo", "assai", "attesa", "attraverso", "avanti", "avemmo", + "avendo", "avente", "aver", "avere", "averlo", "avesse", "avessero", "avessi", "avessimo", "aveste", "avesti", "avete", "aveva", + "avevamo", "avevano", "avevate", "avevi", "avevo", "avrai", "avranno", "avrebbe", "avrebbero", "avrei", "avremmo", "avremo", + "avreste", "avresti", "avrete", "avrà", "avrò", "avuta", "avute", "avuti", "avuto", "basta", "bene", "benissimo", "berlusconi", + "brava", "bravo", "c", "casa", "caso", "cento", "certa", "certe", "certi", "certo", "che", "chi", "chicchessia", "chiunque", "ci", + "ciascuna", "ciascuno", "cima", "cio", "cioe", "cioè", "circa", "citta", "città", "ciò", "co", "codesta", "codesti", "codesto", + "cogli", "coi", "col", "colei", "coll", "coloro", "colui", "come", "cominci", "comunque", "con", "concernente", "conciliarsi", + "conclusione", "consiglio", "contro", "cortesia", "cos", "cosa", "cosi", "così", "cui", "d", "da", "dagl", "dagli", "dai", "dal", + "dall", "dalla", "dalle", "dallo", "dappertutto", "davanti", "degl", "degli", "dei", "del", "dell", "della", "delle", "dello", + "dentro", "detto", "deve", "di", "dice", "dietro", "dire", "dirimpetto", "diventa", "diventare", "diventato", "dopo", "dov", "dove", + "dovra", "dovrà", "dovunque", "due", "dunque", "durante", "e", "ebbe", "ebbero", "ebbi", "ecc", "ecco", "ed", "effettivamente", "egli", + "ella", "entrambi", "eppure", "era", "erano", "eravamo", "eravate", "eri", "ero", "esempio", "esse", "essendo", "esser", "essere", + "essi", "ex", "fa", "faccia", "facciamo", "facciano", "facciate", "faccio", "facemmo", "facendo", "facesse", "facessero", "facessi", + "facessimo", "faceste", "facesti", "faceva", "facevamo", "facevano", "facevate", "facevi", "facevo", "fai", "fanno", "farai", + "faranno", "fare", "farebbe", "farebbero", "farei", "faremmo", "faremo", "fareste", "faresti", "farete", "farà", "farò", "fatto", + "favore", "fece", "fecero", "feci", "fin", "finalmente", "finche", "fine", "fino", "forse", "forza", "fosse", "fossero", "fossi", + "fossimo", "foste", "fosti", "fra", "frattempo", "fu", "fui", "fummo", "fuori", "furono", "futuro", "generale", "gia", "giacche", + "giorni", "giorno", "già", "gli", "gliela", "gliele", "glieli", "glielo", "gliene", "governo", "grande", "grazie", "gruppo", "ha", + "haha", "hai", "hanno", "ho", "i", "ieri", "il", "improvviso", "in", "inc", "infatti", "inoltre", "insieme", "intanto", "intorno", + "invece", "io", "l", "la", "lasciato", "lato", "lavoro", "le", "lei", "li", "lo", "lontano", "loro", "lui", "lungo", "luogo", "là", + "ma", "macche", "magari", "maggior", "mai", "male", "malgrado", "malissimo", "mancanza", "marche", "me", "medesimo", "mediante", + "meglio", "meno", "mentre", "mesi", "mezzo", "mi", "mia", "mie", "miei", "mila", "miliardi", "milioni", "minimi", "ministro", + "mio", "modo", "molti", "moltissimo", "molto", "momento", "mondo", "mosto", "nazionale", "ne", "negl", "negli", "nei", "nel", + "nell", "nella", "nelle", "nello", "nemmeno", "neppure", "nessun", "nessuna", "nessuno", "niente", "no", "noi", "non", "nondimeno", + "nonostante", "nonsia", "nostra", "nostre", "nostri", "nostro", "novanta", "nove", "nulla", "nuovo", "o", "od", "oggi", "ogni", + "ognuna", "ognuno", "oltre", "oppure", "ora", "ore", "osi", "ossia", "ottanta", "otto", "paese", "parecchi", "parecchie", + "parecchio", "parte", "partendo", "peccato", "peggio", "per", "perche", "perchè", "perché", "percio", "perciò", "perfino", "pero", + "persino", "persone", "però", "piedi", "pieno", "piglia", "piu", "piuttosto", "più", "po", "pochissimo", "poco", "poi", "poiche", + "possa", "possedere", "posteriore", "posto", "potrebbe", "preferibilmente", "presa", "press", "prima", "primo", "principalmente", + "probabilmente", "proprio", "puo", "pure", "purtroppo", "può", "qualche", "qualcosa", "qualcuna", "qualcuno", "quale", "quali", + "qualunque", "quando", "quanta", "quante", "quanti", "quanto", "quantunque", "quasi", "quattro", "quel", "quella", "quelle", + "quelli", "quello", "quest", "questa", "queste", "questi", "questo", "qui", "quindi", "realmente", "recente", "recentemente", + "registrazione", "relativo", "riecco", "salvo", "sara", "sarai", "saranno", "sarebbe", "sarebbero", "sarei", "saremmo", "saremo", + "sareste", "saresti", "sarete", "sarà", "sarò", "scola", "scopo", "scorso", "se", "secondo", "seguente", "seguito", "sei", "sembra", + "sembrare", "sembrato", "sembri", "sempre", "senza", "sette", "si", "sia", "siamo", "siano", "siate", "siete", "sig", "solito", + "solo", "soltanto", "sono", "sopra", "sotto", "spesso", "srl", "sta", "stai", "stando", "stanno", "starai", "staranno", "starebbe", + "starebbero", "starei", "staremmo", "staremo", "stareste", "staresti", "starete", "starà", "starò", "stata", "state", "stati", + "stato", "stava", "stavamo", "stavano", "stavate", "stavi", "stavo", "stemmo", "stessa", "stesse", "stessero", "stessi", "stessimo", + "stesso", "steste", "stesti", "stette", "stettero", "stetti", "stia", "stiamo", "stiano", "stiate", "sto", "su", "sua", "subito", + "successivamente", "successivo", "sue", "sugl", "sugli", "sui", "sul", "sull", "sulla", "sulle", "sullo", "suo", "suoi", "tale", + "tali", "talvolta", "tanto", "te", "tempo", "ti", "titolo", "torino", "tra", "tranne", "tre", "trenta", "troppo", "trovato", "tu", + "tua", "tue", "tuo", "tuoi", "tutta", "tuttavia", "tutte", "tutti", "tutto", "uguali", "ulteriore", "ultimo", "un", "una", "uno", + "uomo", "va", "vale", "vari", "varia", "varie", "vario", "verso", "vi", "via", "vicino", "visto", "vita", "voi", "volta", "volte", + "vostra", "vostre", "vostri", "vostro", "è"} - End Function + Public StopWordsSpanish() As String = {"a", "actualmente", "acuerdo", "adelante", "ademas", "además", "adrede", "afirmó", "agregó", "ahi", "ahora", + "ahí", "al", "algo", "alguna", "algunas", "alguno", "algunos", "algún", "alli", "allí", "alrededor", "ambos", "ampleamos", + "antano", "antaño", "ante", "anterior", "antes", "apenas", "aproximadamente", "aquel", "aquella", "aquellas", "aquello", + "aquellos", "aqui", "aquél", "aquélla", "aquéllas", "aquéllos", "aquí", "arriba", "arribaabajo", "aseguró", "asi", "así", + "atras", "aun", "aunque", "ayer", "añadió", "aún", "b", "bajo", "bastante", "bien", "breve", "buen", "buena", "buenas", "bueno", + "buenos", "c", "cada", "casi", "cerca", "cierta", "ciertas", "cierto", "ciertos", "cinco", "claro", "comentó", "como", "con", + "conmigo", "conocer", "conseguimos", "conseguir", "considera", "consideró", "consigo", "consigue", "consiguen", "consigues", + "contigo", "contra", "cosas", "creo", "cual", "cuales", "cualquier", "cuando", "cuanta", "cuantas", "cuanto", "cuantos", "cuatro", + "cuenta", "cuál", "cuáles", "cuándo", "cuánta", "cuántas", "cuánto", "cuántos", "cómo", "d", "da", "dado", "dan", "dar", "de", + "debajo", "debe", "deben", "debido", "decir", "dejó", "del", "delante", "demasiado", "demás", "dentro", "deprisa", "desde", + "despacio", "despues", "después", "detras", "detrás", "dia", "dias", "dice", "dicen", "dicho", "dieron", "diferente", "diferentes", + "dijeron", "dijo", "dio", "donde", "dos", "durante", "día", "días", "dónde", "e", "ejemplo", "el", "ella", "ellas", "ello", "ellos", + "embargo", "empleais", "emplean", "emplear", "empleas", "empleo", "en", "encima", "encuentra", "enfrente", "enseguida", "entonces", + "entre", "era", "eramos", "eran", "eras", "eres", "es", "esa", "esas", "ese", "eso", "esos", "esta", "estaba", "estaban", "estado", + "estados", "estais", "estamos", "estan", "estar", "estará", "estas", "este", "esto", "estos", "estoy", "estuvo", "está", "están", "ex", + "excepto", "existe", "existen", "explicó", "expresó", "f", "fin", "final", "fue", "fuera", "fueron", "fui", "fuimos", "g", "general", + "gran", "grandes", "gueno", "h", "ha", "haber", "habia", "habla", "hablan", "habrá", "había", "habían", "hace", "haceis", "hacemos", + "hacen", "hacer", "hacerlo", "haces", "hacia", "haciendo", "hago", "han", "hasta", "hay", "haya", "he", "hecho", "hemos", "hicieron", + "hizo", "horas", "hoy", "hubo", "i", "igual", "incluso", "indicó", "informo", "informó", "intenta", "intentais", "intentamos", "intentan", + "intentar", "intentas", "intento", "ir", "j", "junto", "k", "l", "la", "lado", "largo", "las", "le", "lejos", "les", "llegó", "lleva", + "llevar", "lo", "los", "luego", "lugar", "m", "mal", "manera", "manifestó", "mas", "mayor", "me", "mediante", "medio", "mejor", "mencionó", + "menos", "menudo", "mi", "mia", "mias", "mientras", "mio", "mios", "mis", "misma", "mismas", "mismo", "mismos", "modo", "momento", "mucha", + "muchas", "mucho", "muchos", "muy", "más", "mí", "mía", "mías", "mío", "míos", "n", "nada", "nadie", "ni", "ninguna", "ningunas", "ninguno", + "ningunos", "ningún", "no", "nos", "nosotras", "nosotros", "nuestra", "nuestras", "nuestro", "nuestros", "nueva", "nuevas", "nuevo", + "nuevos", "nunca", "o", "ocho", "os", "otra", "otras", "otro", "otros", "p", "pais", "para", "parece", "parte", "partir", "pasada", + "pasado", "paìs", "peor", "pero", "pesar", "poca", "pocas", "poco", "pocos", "podeis", "podemos", "poder", "podria", "podriais", + "podriamos", "podrian", "podrias", "podrá", "podrán", "podría", "podrían", "poner", "por", "porque", "posible", "primer", "primera", + "primero", "primeros", "principalmente", "pronto", "propia", "propias", "propio", "propios", "proximo", "próximo", "próximos", "pudo", + "pueda", "puede", "pueden", "puedo", "pues", "q", "qeu", "que", "quedó", "queremos", "quien", "quienes", "quiere", "quiza", "quizas", + "quizá", "quizás", "quién", "quiénes", "qué", "r", "raras", "realizado", "realizar", "realizó", "repente", "respecto", "s", "sabe", + "sabeis", "sabemos", "saben", "saber", "sabes", "salvo", "se", "sea", "sean", "segun", "segunda", "segundo", "según", "seis", "ser", + "sera", "será", "serán", "sería", "señaló", "si", "sido", "siempre", "siendo", "siete", "sigue", "siguiente", "sin", "sino", "sobre", + "sois", "sola", "solamente", "solas", "solo", "solos", "somos", "son", "soy", "soyos", "su", "supuesto", "sus", "suya", "suyas", "suyo", + "sé", "sí", "sólo", "t", "tal", "tambien", "también", "tampoco", "tan", "tanto", "tarde", "te", "temprano", "tendrá", "tendrán", "teneis", + "tenemos", "tener", "tenga", "tengo", "tenido", "tenía", "tercera", "ti", "tiempo", "tiene", "tienen", "toda", "todas", "todavia", + "todavía", "todo", "todos", "total", "trabaja", "trabajais", "trabajamos", "trabajan", "trabajar", "trabajas", "trabajo", "tras", + "trata", "través", "tres", "tu", "tus", "tuvo", "tuya", "tuyas", "tuyo", "tuyos", "tú", "u", "ultimo", "un", "una", "unas", "uno", "unos", + "usa", "usais", "usamos", "usan", "usar", "usas", "uso", "usted", "ustedes", "v", "va", "vais", "valor", "vamos", "van", "varias", "varios", + "vaya", "veces", "ver", "verdad", "verdadera", "verdadero", "vez", "vosotras", "vosotros", "voy", "vuestra", "vuestras", "vuestro", + "vuestros", "w", "x", "y", "ya", "yo", "z", "él", "ésa", "ésas", "ése", "ésos", "ésta", "éstas", "éste", "éstos", "última", "últimas", + "último", "últimos"} - ''' - ''' Counts the vowels used (AEIOU) - ''' - ''' - ''' - ''' - - Public Function VowelCount(ByVal InputString As String) As Integer - Dim v(9) As String 'Declare an array of 10 elements 0 to 9 - Dim vcount As Short 'This variable will contain number of vowels - Dim flag As Integer - Dim strLen As Integer - Dim i As Integer - v(0) = "a" 'First element of array is assigned small a - v(1) = "i" - v(2) = "o" - v(3) = "u" - v(4) = "e" - v(5) = "A" 'Sixth element is assigned Capital A - v(6) = "I" - v(7) = "O" - v(8) = "U" - v(9) = "E" - strLen = Len(InputString) - - For flag = 1 To strLen 'It will get every letter of entered string and loop - 'will terminate when all letters have been examined - - For i = 0 To 9 'Takes every element of v(9) one by one - 'Check if current letter is a vowel - If Mid(InputString, flag, 1) = v(i) Then - vcount = vcount + 1 ' If letter is equal to vowel - 'then increment vcount by 1 - End If - Next i 'Consider next value of v(i) - Next flag 'Consider next letter of the enterd string + ''' + ''' Removes StopWords from sentence + ''' ARAB/ENG/DUTCH/FRENCH/SPANISH/ITALIAN + ''' Hopefully leaving just relevant words in the user sentence + ''' Currently Under Revision (takes too many words) + ''' + ''' + ''' + Public Function RemoveStopWords(ByRef Userinput As String) As String + ' Userinput = LCase(Userinput).Replace("the", "r") + For Each item In StopWordsENG + Userinput = LCase(Userinput).Replace(item, "") + Next + For Each item In StopWordsArab + Userinput = Userinput.Replace(item, "") + Next + For Each item In StopWordsDutch + Userinput = Userinput.Replace(item, "") + Next + For Each item In StopWordsFrench + Userinput = Userinput.Replace(item, "") + Next + For Each item In StopWordsItalian + Userinput = Userinput.Replace(item, "") + Next + For Each item In StopWordsSpanish + Userinput = Userinput.Replace(item, "") + Next + Return Userinput + End Function - VowelCount = vcount + ''' + ''' Removes Stop words given a list of stop words + ''' + ''' user input + ''' stop word list + ''' + Public Function RemoveStopWords(ByRef Userinput As String, ByRef Lst As List(Of String)) As String + For Each item In Lst + Userinput = LCase(Userinput).Replace(item, "") + Next + Return Userinput + End Function - End Function + End Class - End Module + End Namespace Public Class RegexFilter diff --git a/SpydazWebAI_InputModeller.vbproj b/SpydazWebAI_InputModeller.vbproj index 69cf341..6454650 100644 --- a/SpydazWebAI_InputModeller.vbproj +++ b/SpydazWebAI_InputModeller.vbproj @@ -89,6 +89,7 @@ +