# Tokenise words into individual tokens with spaCy.
import spacy
# Load the medium English pipeline once at module level (reused for every call).
# Requires the model to be installed: python -m spacy download en_core_web_md
nlp = spacy.load('en_core_web_md')
def tokenise(words):
    """Tokenise each string in *words* with spaCy and flatten the results.

    Args:
        words: An iterable of strings (each may contain several words).

    Returns:
        A single flat list of token texts, in input order.
    """
    # One nlp() call per input string; each Doc yields its tokens in order.
    return [token.text for word in words for doc in (nlp(word),) for token in doc]
# Demo entry point: only run when executed as a script, not on import.
if __name__ == "__main__":
    sample_words = ["hi and hello", "thank you and goodnight"]
    result = tokenise(sample_words)
    print(result)
# Expected output: ['hi', 'and', 'hello', 'thank', 'you', 'and', 'goodnight']