CyxWiz
Docs › Text Processing

Text Processing Tools

Natural language processing tools for text analysis, tokenization, and feature extraction.

Available Tools

- Tokenization — word, sentence, and subword tokenization
- Text Cleaning — normalization, stopword removal, stemming
- Vectorization — TF-IDF, word embeddings
- Text Analysis — sentiment, entities, keywords
- Similarity — document comparison, search

Tokenization

import cyxwiz.text as text

# Word tokenization
tokens = text.word_tokenize("Hello, world! How are you?")
# ['Hello', ',', 'world', '!', 'How', 'are', 'you', '?']

# Sentence tokenization
sentences = text.sent_tokenize(paragraph)

# Subword tokenization (BPE)
tokenizer = text.BPETokenizer(vocab_size=10000)
tokenizer.fit(corpus)
tokens = tokenizer.encode("Hello world")
decoded = tokenizer.decode(tokens)

Text Cleaning

# Lowercase
cleaned = text.lower(text_data)

# Remove punctuation
cleaned = text.remove_punctuation(text_data)

# Remove stopwords
cleaned = text.remove_stopwords(text_data, language='english')

# Stemming
stemmer = text.PorterStemmer()
stemmed = stemmer.stem("running")  # 'run'

# Lemmatization
lemmatizer = text.WordNetLemmatizer()
lemma = lemmatizer.lemmatize("better", pos='a')  # 'good'

# Pipeline
pipeline = text.Pipeline([
    text.lower,
    text.remove_punctuation,
    text.remove_stopwords,
    text.stem
])
cleaned = pipeline(text_data)

Vectorization

TF-IDF
from cyxwiz.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2)
)
tfidf_matrix = vectorizer.fit_transform(
    documents
)

# Get feature names
features = vectorizer.get_feature_names()
Word Embeddings
from cyxwiz.text import Word2Vec

model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1
)

# Get word vector
vector = model.wv['machine']

# Find similar words
similar = model.wv.most_similar('king')

Text Analysis

# Sentiment analysis
sentiment = text.sentiment(text_data)
# {'positive': 0.8, 'negative': 0.1, 'neutral': 0.1}

# Keyword extraction
keywords = text.extract_keywords(text_data, top_n=10)

# Named Entity Recognition
entities = text.extract_entities(text_data)
# [('Apple', 'ORG'), ('Tim Cook', 'PERSON'), ('California', 'GPE')]

# N-grams
bigrams = text.ngrams(tokens, n=2)
trigrams = text.ngrams(tokens, n=3)

Text Similarity

# Cosine similarity
similarity = text.cosine_similarity(vec1, vec2)

# Document similarity matrix
sim_matrix = text.pairwise_similarity(documents)

# Fuzzy string matching
ratio = text.fuzz_ratio("hello world", "hello word")
# 91

# Semantic similarity (using embeddings)
sim = text.semantic_similarity(
    "machine learning is great",
    "deep learning is awesome"
)

Node Editor Integration

| Node      | Inputs    | Outputs          |
|-----------|-----------|------------------|
| Tokenize  | Text      | Token list       |
| TF-IDF    | Documents | Sparse matrix    |
| Embed     | Tokens    | Embedding tensor |
| Sentiment | Text      | Score dict       |