# Encoding conversion example: GBK-encoded bytes are re-encoded as UTF-8.
text = bytes("示例文本", 'gbk')  # stand-in for input that arrived GBK-encoded
text = str(text, 'gbk').encode('utf-8')  # decode from GBK, then emit UTF-8 bytes


import re

# Strip HTML tags with a regular expression
text = "<p>这是一段<b>HTML</b>文本</p>"
clean_text = re.compile(r'<[^>]+>').sub('', text)
print(clean_text)  # prints: 这是一段HTML文本

# English word tokenization with NLTK
# NOTE(review): no NLTK data download is visible before this point — confirm
# the tokenizer data that word_tokenize relies on is already installed.
from nltk.tokenize import word_tokenize

text = "Natural Language Processing is fascinating!"
tokens = word_tokenize(text)
print(tokens)  # ['Natural', 'Language', 'Processing', 'is', 'fascinating', '!']

# Chinese word segmentation with jieba
import jieba

text = "自然语言处理非常有趣"
tokens = jieba.lcut(text)
print(tokens)  # ['自然语言', '处理', '非常', '有趣']

# HuggingFace tokenizer example
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
tokens = tokenizer.tokenize("自然语言处理")
print(tokens)  # ['自', '然', '语', '言', '处', '理']

# Part-of-speech tagging with spaCy
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Natural Language Processing is fascinating!")
for token in doc:
    print(token.text, token.pos_)  # print each token together with its POS tag


from sklearn.feature_extraction.text import CountVectorizer

# Small English corpus; later snippets in this file reuse the name `corpus`
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]

# Bag of words: fit the vocabulary and build the document-term count matrix
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(X.toarray())


from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting of the same corpus
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)
print(tfidf_vectorizer.get_feature_names_out())
print(X_tfidf.toarray())


# ngram_range=(2, 2) restricts the extracted features to bigrams
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
X_bigram = bigram_vectorizer.fit_transform(corpus)
print(bigram_vectorizer.get_feature_names_out())

from gensim.models import Word2Vec

# Training input: one list of tokens per sentence
sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Look up a word vector
vector = model.wv['cat']
# Find similar words
similar_words = model.wv.most_similar('cat')

from gensim.models import FastText

# NOTE: `model` and `vector` are rebound here, replacing the Word2Vec objects above
model = FastText(sentences, vector_size=100, window=5, min_count=1, workers=4)
# A vector can be obtained even for a word that is not in the vocabulary
vector = model.wv['unseenword']

from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# return_tensors="pt" asks the tokenizer for PyTorch tensors
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess

# TaggedDocument expects a list of word tokens. `corpus` holds raw sentences,
# and passing a string directly would make every character a "word", so each
# sentence is tokenized (lowercased, punctuation dropped) first.
documents = [
    TaggedDocument(words=simple_preprocess(doc), tags=[i])
    for i, doc in enumerate(corpus)
]
model = Doc2Vec(documents, vector_size=100, window=5, min_count=1, workers=4)
# infer_vector likewise takes an already-tokenized document
vector = model.infer_vector(["new", "document", "text"])

# Sentence embeddings with Sentence-BERT
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = ["This is an example sentence", "Each sentence is converted"]
embeddings = model.encode(sentences)


from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# LDA topic model fitted on bag-of-words counts of `corpus` (defined earlier in the file)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
lda = LatentDirichletAllocation(n_components=2)
lda.fit(X)

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

## 1. 文本预处理
def preprocess_text(text):
    """Normalize English text for bag-of-words style models.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is removed, NLTK's English stop words are dropped and the
    remaining words are Porter-stemmed.

    Returns the stemmed words joined by single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    english_stops = set(stopwords.words('english'))
    porter = PorterStemmer()
    stems = (
        porter.stem(token)
        for token in letters_only.split()
        if token not in english_stops
    )
    return ' '.join(stems)


## 实践示例：新闻分类

from sklearn.datasets import fetch_20newsgroups

# Use four categories as an example
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

# Load the training and test splits
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

print(f"训练集样本数: {len(newsgroups_train.data)}")
print(f"测试集样本数: {len(newsgroups_test.data)}")

from sklearn.feature_extraction.text import TfidfVectorizer

# Create the TF-IDF vectorizer (max_features=5000)
vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the training texts only, then apply the fitted vectorizer to the test texts
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)

y_train = newsgroups_train.target
y_test = newsgroups_test.target

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Create and train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model: overall accuracy plus a per-class report
print(f"准确率: {accuracy_score(y_test, y_pred):.2f}")
print("\n分类报告:")
print(classification_report(y_test, y_pred, target_names=newsgroups_test.target_names))

# Sentiment classification with scikit-learn
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# Tiny illustrative training set so the example runs end to end (the names
# were previously undefined); replace it with a real labelled corpus.
train_texts = [
    "这个产品非常好用，强烈推荐！",
    "质量很好，物超所值",
    "太差了，完全不能用",
    "非常失望，不推荐购买",
]
train_labels = ["positive", "positive", "negative", "negative"]

# Build the classification pipeline.
# Chinese has no spaces between words, so the vectorizer's default token
# pattern would turn every punctuation-delimited clause into one token;
# jieba performs the word segmentation instead (token_pattern=None because a
# custom tokenizer replaces the pattern).
sentiment_clf = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=jieba.lcut, token_pattern=None,
                              ngram_range=(1, 2))),
    ('clf', LinearSVC())
])

# Train the model
sentiment_clf.fit(train_texts, train_labels)

# Predict the label of a new text
prediction = sentiment_clf.predict(["这个产品非常好用，强烈推荐！"])
print(prediction)  # expected: ['positive']

# Aspect-based sentiment analysis with BERT
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the pretrained model. The input below is Chinese, so the Chinese
# checkpoint (already used elsewhere in this file) is loaded instead of the
# English 'bert-base-uncased'.
# NOTE: the 3-way classification head is newly initialised by from_pretrained,
# so the prediction is not meaningful until the model has been fine-tuned.
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model.eval()  # inference mode: disables dropout

# Prepare the input
text = "餐厅的环境很棒，但服务太慢了。"
aspect = "服务"
# Encode aspect and sentence as a pair. The tokenizer inserts [CLS]/[SEP] and
# the segment ids itself; writing the markers into the string by hand would
# add them a second time.
inputs = tokenizer(aspect, text, return_tensors="pt")

# Predict the sentiment (no gradients are needed for inference)
with torch.no_grad():
    outputs = model(**inputs)
predictions = torch.argmax(outputs.logits, dim=1)
print(predictions)  # e.g. tensor([1]); what each index means depends on fine-tuning

# 使用spaCy进行NER的简单示例
import spacy

# Load the English model
nlp = spacy.load("en_core_web_sm")

# Run the pipeline on the text
text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(text)

# Print each recognized entity with its label
for ent in doc.ents:
    print(ent.text, ent.label_)


# 基于规则的简单NER实现
import re

def rule_based_ner(text):
    """Extract dates and dollar amounts from *text* with regular expressions.

    Returns a dict with two keys, each mapping to the list of matched
    substrings in order of appearance:

    * "日期" - dates written as D/M/Y or D-M-Y style digit groups,
      e.g. ``12/15/2023`` or ``1-2-23``.
    * "货币" - dollar amounts such as ``$5000``, ``$1,000`` or ``$3.50``.
    """
    # Digit look-arounds keep the date pattern from matching inside a longer
    # number (e.g. the "23-12-15" inside "2023-12-15"). \b cannot be used
    # because adjacent CJK characters count as word characters.
    dates = re.findall(r'(?<!\d)\d{1,2}[/-]\d{1,2}[/-]\d{2,4}(?!\d)', text)
    # The fractional part is optional as a unit, so a sentence-ending period
    # right after the amount is not captured; thousands separators are allowed.
    currencies = re.findall(r'\$\d+(?:,\d{3})*(?:\.\d+)?', text)
    return {"日期": dates, "货币": currencies}

sample = "会议定于12/15/2023举行,预算为$5000"
print(rule_based_ner(sample))


# 示例：简单的规则匹配
import re

text = "马云创立了阿里巴巴"
# The founder group stays lazy so it stops at the first "创立了"; the company
# group must be greedy, because a trailing lazy (.+?) is satisfied by a single
# character and would capture only "阿".
pattern = r"(.+?)创立了(.+)"
match = re.search(pattern, text)
if match:
    print(f"创始人: {match.group(1)}, 公司: {match.group(2)}")

# Example: spaCy as a starting point for relation extraction.
# Only the named entities are printed here; no relation between them is derived.
import spacy

nlp = spacy.load("en_core_web_sm")
text = "Apple was founded by Steve Jobs in 1976."
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

# Example: HuggingFace Transformers pipeline
from transformers import pipeline

# NOTE(review): 'bert-base-uncased' looks like a base checkpoint rather than a
# fine-tuned classifier, and the input is Chinese — confirm the returned labels
# are meaningful for this use.
classifier = pipeline("text-classification", model="bert-base-uncased")
result = classifier("马云是阿里巴巴的创始人")
print(result)


# 示例数据集
data = [
    {"text": "比尔盖茨是微软的创始人", "relations": [{"head": "比尔盖茨", "tail": "微软", "type": "创始人"}]},
    {"text": "北京是中国的首都", "relations": [{"head": "北京", "tail": "中国", "type": "首都"}]}
]

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

texts = [d["text"] for d in data]
# Chinese has no spaces between words, so the default token pattern would turn
# each whole sentence into one feature and an unseen sentence would share
# nothing with the training data. Segment with jieba instead
# (token_pattern=None because a custom tokenizer replaces the pattern).
vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern=None)
X = vectorizer.fit_transform(texts)

from sklearn.svm import SVC

# Simplified example: only the first relation type of each sample is used as
# the label; real data needs more elaborate label handling.
y = [d["relations"][0]["type"] for d in data]
model = SVC()
model.fit(X, y)

# Vectorize the test sentence with the already-fitted vocabulary
test_text = "乔布斯创立了苹果公司"
test_vec = vectorizer.transform([test_text])
prediction = model.predict(test_vec)
print(f"预测关系: {prediction[0]}")


# 词袋模型(Bag of Words)  
import jieba
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    '我喜欢自然语言处理',
    '我爱学习NLP技术',
    '文本相似度计算很有趣'
]

# The sentences contain no spaces, so the default token pattern would treat
# every sentence as a single token and the matrix would just mark "which
# sentence is this". jieba supplies the word segmentation
# (token_pattern=None because a custom tokenizer replaces the pattern).
vectorizer = CountVectorizer(tokenizer=jieba.lcut, token_pattern=None)
X = vectorizer.fit_transform(corpus)
print(X.toarray())


# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Same segmentation for the TF-IDF weighted representation
tfidf = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern=None)
tfidf_matrix = tfidf.fit_transform(corpus)
print(tfidf_matrix.toarray())

# Word2Vec 相似度
from gensim.models import Word2Vec

# Pre-segmented Chinese sentences: one list of tokens per sentence
sentences = [
    ['我','喜欢','自然语言处理'],
    ['我','爱','学习','NLP','技术'],
    ['文本','相似度','计算','很','有趣']
]

model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
vector = model.wv['自然语言处理']  # look up the word vector

# 句子向量计算
import numpy as np

def sentence_vector(sentence, model):
    """Average the word vectors of the tokens in *sentence*.

    Tokens that are not present in ``model.wv`` are skipped. When none of the
    tokens is known, a zero vector of length ``model.vector_size`` is returned.
    """
    known = []
    for token in sentence:
        if token in model.wv:
            known.append(model.wv[token])
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

sentence_vec1 = sentence_vector(['我','喜欢','自然语言处理'], model)
sentence_vec2 = sentence_vector(['我','爱','NLP'], model)

from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')

# return_tensors="pt" asks the tokenizer for PyTorch tensors
inputs = tokenizer("这是一个示例句子", return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state


from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the first and the second row of tfidf_matrix;
# the one-row slices keep both arguments two-dimensional
similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
print(f"文本相似度: {similarity[0][0]:.4f}")


# 新闻标题相似度检测
import jieba
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample data
titles = [
    "苹果发布新款iPhone手机",
    "苹果公司推出最新智能手机",
    "微软公布季度财报",
    "谷歌宣布新的人工智能计划"
]

# Compute the pairwise similarity matrix.
# The titles contain no spaces, so the default token pattern would make each
# whole title one token and every pair of different titles would score 0.
# jieba supplies the word segmentation (token_pattern=None because a custom
# tokenizer replaces the pattern).
tfidf = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern=None)
tfidf_matrix = tfidf.fit_transform(titles)
similarities = cosine_similarity(tfidf_matrix)

# Show the result as a labelled table
df = pd.DataFrame(similarities, columns=titles, index=titles)
print(df)


# 简单的 RNN 单元实现示例
import numpy as np

class SimpleRNN:
    """Minimal single-step recurrent cell with a tanh activation.

    Attributes:
        Wx: input weights, shape (hidden_size, input_size).
        Wh: recurrent weights, shape (hidden_size, hidden_size).
        b:  bias column, shape (hidden_size, 1), initialised to zeros.
    """

    def __init__(self, input_size, hidden_size):
        # Wx is drawn before Wh so that a seeded generator yields the same weights.
        wx_shape = (hidden_size, input_size)
        wh_shape = (hidden_size, hidden_size)
        self.Wx = np.random.randn(*wx_shape)
        self.Wh = np.random.randn(*wh_shape)
        self.b = np.zeros((hidden_size, 1))

    def forward(self, x, h_prev):
        """Return the next hidden state: tanh(Wx.x + Wh.h_prev + b)."""
        pre_activation = self.Wx.dot(x) + self.Wh.dot(h_prev) + self.b
        return np.tanh(pre_activation)

# LSTM 单元的基本实现
class LSTMCell:
    """Single-step LSTM cell with all four gates packed into one weight matrix.

    ``W`` has shape (4*hidden_size, input_size+hidden_size) and ``b`` has shape
    (4*hidden_size, 1). Their row blocks are, in order: forget gate, input
    gate, output gate, candidate memory. The columns of ``W`` act on the
    vertical stack ``[h_prev; x]``.
    """

    def __init__(self, input_size, hidden_size):
        # Kept on the instance so forward() can slice the packed gates.
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Combined weights for all gates
        self.W = np.random.randn(4*hidden_size, input_size+hidden_size)
        self.b = np.random.randn(4*hidden_size, 1)

    @staticmethod
    def _sigmoid(z):
        """Logistic function, written via tanh so large |z| cannot overflow exp()."""
        return 0.5 * (1.0 + np.tanh(0.5 * z))

    def forward(self, x, h_prev, c_prev):
        """Advance the cell by one time step.

        Args:
            x: input column(s), stacked below ``h_prev`` (input_size rows).
            h_prev: previous hidden state (hidden_size rows).
            c_prev: previous cell state (hidden_size rows).

        Returns:
            Tuple ``(h_next, c_next)``.
        """
        n = self.hidden_size
        combined = np.vstack((h_prev, x))
        gates = np.dot(self.W, combined) + self.b

        # Split the packed pre-activations into the individual gates
        f_gate = self._sigmoid(gates[:n])        # forget gate
        i_gate = self._sigmoid(gates[n:2*n])     # input gate
        o_gate = self._sigmoid(gates[2*n:3*n])   # output gate
        c_candidate = np.tanh(gates[3*n:])       # candidate memory

        # Update the cell state, then derive the hidden state from it
        c_next = f_gate * c_prev + i_gate * c_candidate
        h_next = o_gate * np.tanh(c_next)

        return h_next, c_next

# GRU 单元的实现
class GRUCell:
    """Single-step GRU cell with update, reset and candidate weights packed together.

    ``W`` has shape (3*hidden_size, input_size+hidden_size) and ``b`` has shape
    (3*hidden_size, 1). Their row blocks are, in order: update gate, reset
    gate, candidate state. The columns of ``W`` act on the vertical stack
    ``[h; x]``.
    """

    def __init__(self, input_size, hidden_size):
        # Kept on the instance so forward() can slice the packed weights.
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.W = np.random.randn(3*hidden_size, input_size+hidden_size)
        self.b = np.random.randn(3*hidden_size, 1)

    @staticmethod
    def _sigmoid(z):
        """Logistic function, written via tanh so large |z| cannot overflow exp()."""
        return 0.5 * (1.0 + np.tanh(0.5 * z))

    def forward(self, x, h_prev):
        """Advance the cell by one time step and return the next hidden state.

        Args:
            x: input column(s), stacked below the hidden state (input_size rows).
            h_prev: previous hidden state (hidden_size rows).
        """
        n = self.hidden_size
        combined = np.vstack((h_prev, x))
        # Only the update/reset rows use the plain [h_prev; x] stack.
        gate_pre = np.dot(self.W[:2*n], combined) + self.b[:2*n]

        z = self._sigmoid(gate_pre[:n])   # update gate
        r = self._sigmoid(gate_pre[n:])   # reset gate

        # The candidate sees the hidden state scaled by the reset gate.
        reset_combined = np.vstack((r*h_prev, x))
        h_candidate = np.tanh(np.dot(self.W[2*n:], reset_combined) + self.b[2*n:])

        # Blend the previous state and the candidate according to the update gate
        h_next = (1-z)*h_prev + z*h_candidate
        return h_next

from tensorflow.keras.layers import Bidirectional, LSTM
from tensorflow.keras.models import Sequential

# `model` must be a Keras model before layers can be added; the name was
# previously bound to a non-Keras model from an earlier example.
model = Sequential()
model.add(Bidirectional(LSTM(64)))  # wrap a 64-unit LSTM in a bidirectional layer

# 简化的自注意力实现示例
import torch
import torch.nn.functional as F

def self_attention(query, key, value):
    """Scaled dot-product attention.

    Scores are ``query @ key^T`` divided by the square root of the size of
    query's last dimension, normalised with a softmax over the last axis, and
    used to take a weighted sum of ``value``.
    """
    scale = query.size(-1) ** 0.5
    key_t = key.transpose(-2, -1)
    attn = F.softmax(torch.matmul(query, key_t) / scale, dim=-1)
    return torch.matmul(attn, value)

# Multi-head attention example
import torch.nn as nn  # must be in scope before the class statement runs


class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Inputs are (batch, seq_len, d_model) tensors. Each of query/key/value is
    linearly projected, split into ``num_heads`` heads of size
    ``d_k = d_model // num_heads``, attended per head over the sequence axis,
    then re-joined and passed through an output projection.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq, d_model) into (batch, heads, seq, d_k)."""
        # The transpose puts the head axis ahead of the sequence axis so the
        # matmuls in forward() attend over sequence positions, not over heads.
        return projected.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value):
        """Return the attended output with shape (batch, query_len, d_model)."""
        batch_size = query.size(0)

        # Linear projections, then split into heads
        Q = self._split_heads(self.W_q(query), batch_size)
        K = self._split_heads(self.W_k(key), batch_size)
        V = self._split_heads(self.W_v(value), batch_size)

        # Per-head attention: (batch, heads, q_len, k_len)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        weights = F.softmax(scores, dim=-1)
        output = torch.matmul(weights, V)

        # Move heads back next to d_k and concatenate them
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)

# 使用HuggingFace Transformers库调用BERT
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Attention maps are only returned when requested; without
# output_attentions=True, `outputs.attentions` below would be None.
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

# Get the attention weights
attention = outputs.attentions  # holds the attention weights of every layer

import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleAttention(nn.Module):
    """Additive-free attention pooling: one learned score per time step.

    A single linear layer maps each encoder state to a scalar score; the
    softmax of the scores over the sequence weights the encoder states.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.attention = nn.Linear(hidden_size, 1)

    def forward(self, encoder_outputs):
        """Pool ``encoder_outputs`` of shape [batch_size, seq_len, hidden_size].

        Returns:
            (context_vector, attention_weights) with shapes
            [batch_size, hidden_size] and [batch_size, seq_len].
        """
        raw_scores = self.attention(encoder_outputs)        # [batch_size, seq_len, 1]
        attention_weights = F.softmax(raw_scores.squeeze(-1), dim=1)
        # [batch_size, 1, seq_len] x [batch_size, seq_len, hidden_size]
        pooled = torch.bmm(attention_weights[:, None, :], encoder_outputs)
        context_vector = pooled[:, 0, :]
        return context_vector, attention_weights

import matplotlib.pyplot as plt
import seaborn as sns

def plot_attention(attention_weights, source_tokens, target_tokens):
    """Show *attention_weights* as a heatmap.

    Columns are labelled with ``source_tokens`` and rows with
    ``target_tokens``. The figure is displayed with ``plt.show()``; nothing is
    returned.
    """
    plt.figure(figsize=(10, 8))
    heatmap_options = {
        "xticklabels": source_tokens,
        "yticklabels": target_tokens,
        "cmap": "YlGnBu",
    }
    sns.heatmap(attention_weights, **heatmap_options)
    plt.xlabel("Source Tokens")
    plt.ylabel("Target Tokens")
    plt.title("Attention Weights Visualization")
    plt.show()

# Example usage
source = ["The", "cat", "sat", "on", "the", "mat"]
target = ["Le", "chat", "s'est", "assis", "sur", "le", "tapis"]
attention = torch.rand(7, 6)  # simulated attention weights: 7 target rows x 6 source columns
plot_attention(attention, source, target)

import nltk
nltk.download('punkt')  # download the required data package

# Example: word tokenization
from nltk.tokenize import word_tokenize
text = "Natural language processing is fascinating."
tokens = word_tokenize(text)
print(tokens)  # Output: ['Natural', 'language', 'processing', 'is', 'fascinating', '.']

# Install the English model: python -m spacy download en_core_web_sm
# Install the Chinese model: python -m spacy download zh_core_web_sm

import spacy

# Load the English model
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Extract the named entities
for ent in doc.ents:
    print(ent.text, ent.label_)
# Output: Apple ORG
#         U.K. GPE
#         $1 billion MONEY

import jieba

# Segmentation in precise mode (cut_all=False)
seg_list = jieba.cut("我爱自然语言处理", cut_all=False)
print("精确模式: " + "/".join(seg_list)) 
# Output: 精确模式: 我/爱/自然语言/处理

# Add a user-defined dictionary
# NOTE(review): "userdict.txt" is a relative path — confirm the file exists
# in the working directory.
jieba.load_userdict("userdict.txt")  # custom dictionary file

# NOTE(review): HanLP.segment / HanLP.parseDependency look like the pyhanlp
# API rather than the `hanlp` package imported here — confirm which package
# is intended.
from hanlp import HanLP

# Segmentation example
print(HanLP.segment('你好，欢迎使用HanLP！'))
# Output: [你好/vl, ，/w, 欢迎/v, 使用/v, HanLP/nx, ！/w]

# Dependency parsing
sentence = HanLP.parseDependency("我爱自然语言处理")
print(sentence)

# Chinese text-processing workflow that combines several tools
import jieba
from hanlp import HanLP
import spacy

text = "自然语言处理是人工智能的重要分支，近年来发展迅速。"

# 1. Word segmentation with jieba
words = list(jieba.cut(text))
print("分词结果:", words)

# 2. Part-of-speech tagging with HanLP
print("\n词性标注:")
print(HanLP.segment(text))

# 3. Process the English part with spaCy's English model
nlp = spacy.load("en_core_web_sm")
doc = nlp("Natural Language Processing is amazing.")
print("\n英文实体识别:")
for ent in doc.ents:
    print(ent.text, ent.label_)

字符类型	处理方法	应用场景
HTML标签	正则表达式移除	网页爬取文本
表情符号	移除或转换为文字描述	社交媒体分析
控制字符	过滤掉	所有文本处理
特殊标点	标准化处理	文本规范化

工具名称	支持语言	特点	适用场景
NLTK	英文为主	功能全面，速度一般	教学、研究
spaCy	多语言	工业级，速度快	生产环境
jieba	中文	简单易用，词典可扩展	中文处理
Stanford CoreNLP	多语言	准确度高，资源消耗大	学术研究
HuggingFace Tokenizers	多语言	支持子词分词	深度学习

特性	Word2Vec	GloVe
训练方式	局部窗口	全局统计
计算效率	较高	较低
小数据集表现	较好	一般
大数据集表现	好	更好

模型	发布时间	主要特点
Word2Vec	2013	静态词向量
GloVe	2014	全局统计+局部窗口
ELMo	2018	双向LSTM，上下文相关
BERT	2018	Transformer，双向上下文
GPT-3	2020	单向Transformer，生成能力强

方法	描述	优点	缺点
词袋模型(BoW)	统计词频	简单直观	忽略词序和语义
TF-IDF	考虑词的重要性	比BoW更精确	仍然忽略上下文
Word2Vec	词向量表示	捕捉语义关系	无法处理多义词
BERT	上下文嵌入	最先进的表示	计算资源要求高

方法名称	特点
余弦相似度	忽略向量长度，专注方向
欧氏距离	考虑向量绝对位置
曼哈顿距离	各维度绝对差之和，受异常值的影响小于欧氏距离
Jaccard相似度	适用于集合相似度

组件	功能
输入门	控制新信息的流入
遗忘门	决定丢弃哪些旧信息
输出门	控制输出的信息量
记忆单元	保存长期状态

组件	功能
更新门	决定保留多少旧信息
重置门	决定如何组合新旧信息
候选激活	基于重置门计算的新状态

优点	缺点
功能全面，覆盖 NLP 主要任务	执行效率较低
文档完善，学习资源丰富	需要额外下载数据包
适合教学和研究	对中文支持有限

语言模型¶

统计原理¶

神经网络模型¶

概率语法模型¶

依存语法模型¶

文本预处理¶

文本清洗¶

分词¶

词性标注¶

文本表示方法¶

传统文本表示¶

词袋模型（Bag of Words）¶

TF-IDF¶

N-gram 模型¶

词向量表示¶

Word2Vec¶

GloVe 词向量¶

FastText¶

上下文感知的表示¶

ELMo 模型¶

BERT 及其变体¶

文档级表示¶

Doc2Vec¶

句向量与文档向量¶

主题模型（LDA）¶

文本分类¶

流程与方法¶

示例¶

情感分析¶

基于词典的情感分析方法¶

基于机器学习的情感分析方法¶

细粒度情感分析¶

命名实体识别¶

评估指标¶

关系抽取¶

主要方法¶

评估指标¶

示例¶

文本相似度计算¶

主要方法¶

相似度度量指标¶

示例¶

神经网络¶

循环神经网络（RNN）¶

长短期记忆网络（LSTM）¶

门控循环单元（GRU）¶

双向 RNN（Bi-RNN）¶

注意力机制¶

自注意力机制¶

多头注意力¶

案例：BERT中的注意力¶

练习1：实现基础注意力机制¶

练习2：可视化注意力权重¶

Transformer架构¶

核心思想¶

Python NLP 生态¶

NLTK¶

spaCy¶

jieba¶

HanLP¶

案例：中文文本分析流程¶