diff --git "a/TimesNet_\345\274\202\345\270\270\346\243\200\346\265\213\345\256\214\346\225\264\350\256\255\347\273\203\346\265\201\347\250\213.md" "b/TimesNet_\345\274\202\345\270\270\346\243\200\346\265\213\345\256\214\346\225\264\350\256\255\347\273\203\346\265\201\347\250\213.md" new file mode 100644 index 000000000..ff6b97a5d --- /dev/null +++ "b/TimesNet_\345\274\202\345\270\270\346\243\200\346\265\213\345\256\214\346\225\264\350\256\255\347\273\203\346\265\201\347\250\213.md" @@ -0,0 +1,739 @@ +# TimesNet异常检测任务完整训练流程详解 + +本文档详细记录了TimesNet模型在异常检测任务中从脚本启动到训练完成的完整函数调用流程。 + +## 📋 目录 + +- [阶段一:程序启动与初始化](#阶段一程序启动与初始化) +- [阶段二:实验类与模型初始化](#阶段二实验类与模型初始化) +- [阶段三:数据加载初始化](#阶段三数据加载初始化) +- [阶段四:训练过程](#阶段四训练过程) +- [阶段五:验证过程](#阶段五验证过程) +- [阶段六:测试与异常检测](#阶段六测试与异常检测) +- [完整调用链总结](#完整调用链总结) + +--- + +## 🚀 阶段一:程序启动与初始化 + +### 1.1 脚本启动 + +```bash +# 典型的异常检测任务启动命令 +python -u run.py \ + --task_name anomaly_detection \ + --is_training 1 \ + --root_path ./dataset/PSM \ + --model_id PSM \ + --model TimesNet \ + --data PSM \ + --features M \ + --seq_len 100 \ + --pred_len 0 \ + --d_model 64 \ + --d_ff 64 \ + --e_layers 2 \ + --enc_in 25 \ + --c_out 25 \ + --top_k 3 \ + --anomaly_ratio 1 \ + --batch_size 128 \ + --train_epochs 3 +``` + +### 1.2 程序入口 (`run.py`) + +```python +# run.py:14-240 +if __name__ == '__main__': + # 1. 设置随机种子 + fix_seed = 2021 + random.seed(fix_seed) + torch.manual_seed(fix_seed) + np.random.seed(fix_seed) + + # 2. 解析命令行参数 + parser = argparse.ArgumentParser(description='TimesNet') + args = parser.parse_args() + + # 3. 设备配置 + if torch.cuda.is_available() and args.use_gpu: + args.device = torch.device('cuda:{}'.format(args.gpu)) + print('Using GPU') + + # 4. 打印参数 + print_args(args) + + # 5. 选择实验类 - 关键决策点 + if args.task_name == 'anomaly_detection': + Exp = Exp_Anomaly_Detection # ← 选择异常检测实验类 + + # 6. 开始训练循环 + if args.is_training: + for ii in range(args.itr): + exp = Exp(args) # ← 创建实验实例,触发初始化链 + setting = '实验名称设置...' + print('>>>>>>>start training : {}>>>>>>>>>>>>>>>>>>>>>>>>>>'.format(setting)) + exp.train(setting) # ← 开始训练 + print('>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting)) + exp.test(setting) # ← 开始测试 +``` + +--- + +## 🏗️ 阶段二:实验类与模型初始化 + +### 2.1 实验类初始化 (`exp/exp_anomaly_detection.py`) + +```python +# exp_anomaly_detection.py:20-22 +class Exp_Anomaly_Detection(Exp_Basic): + def __init__(self, args): + super(Exp_Anomaly_Detection, self).__init__(args) # ← 调用父类初始化 +``` + +### 2.2 基础实验类初始化 (`exp/exp_basic.py`) + +```python +# exp_basic.py:10-41 +class Exp_Basic(object): + def __init__(self, args): + self.args = args + + # 2.2.1 注册模型字典 + self.model_dict = { + 'TimesNet': TimesNet, # ← 注册TimesNet模型 + 'Autoformer': Autoformer, + 'Transformer': Transformer, + # ... 
其他模型 + } + + # 2.2.2 获取计算设备 + self.device = self._acquire_device() + + # 2.2.3 构建模型并移动到设备 - 关键步骤 + self.model = self._build_model().to(self.device) # ← 构建TimesNet模型 +``` + +### 2.3 模型构建 (`exp/exp_anomaly_detection.py`) + +```python +# exp_anomaly_detection.py:24-28 +def _build_model(self): + # 通过模型字典获取TimesNet类并实例化 + model = self.model_dict[self.args.model].Model(self.args).float() # ← 调用TimesNet.__init__() + + # 多GPU支持 + if self.args.use_multi_gpu and self.args.use_gpu: + model = nn.DataParallel(model, device_ids=self.args.device_ids) + return model +``` + +### 2.4 TimesNet模型初始化 (`models/TimesNet.py`) + +```python +# TimesNet.py:76-101 +def __init__(self, configs): + super(Model, self).__init__() + + # 2.4.1 保存配置参数 + self.configs = configs + self.task_name = configs.task_name # = 'anomaly_detection' + self.seq_len = configs.seq_len # 输入序列长度 + self.label_len = configs.label_len + self.pred_len = configs.pred_len # 异常检测为0 + + # 2.4.2 构建TimesBlock模块列表 - 核心组件 + self.model = nn.ModuleList([TimesBlock(configs) for _ in range(configs.e_layers)]) + + # 2.4.3 构建数据嵌入层 + self.enc_embedding = DataEmbedding( + configs.enc_in, # 输入特征数 + configs.d_model, # 嵌入维度 + configs.embed, # 嵌入类型 + configs.freq, # 频率 + configs.dropout # dropout率 + ) + + # 2.4.4 构建Layer Normalization + self.layer = configs.e_layers + self.layer_norm = nn.LayerNorm(configs.d_model) + + # 2.4.5 构建任务特定的输出层 + if self.task_name == 'anomaly_detection': + # 异常检测任务:将嵌入维度映射回原始特征维度 + self.projection = nn.Linear(configs.d_model, configs.c_out, bias=True) +``` + +### 2.5 TimesBlock初始化 (`models/TimesNet.py`) + +```python +# TimesNet.py:21-31 +class TimesBlock(nn.Module): + def __init__(self, configs): + super(TimesBlock, self).__init__() + self.seq_len = configs.seq_len + self.pred_len = configs.pred_len + self.k = configs.top_k # top-k个主要周期 + + # 参数高效的设计:2D卷积序列 + self.conv = nn.Sequential( + Inception_Block_V1(configs.d_model, configs.d_ff, num_kernels=configs.num_kernels), + nn.GELU(), + Inception_Block_V1(configs.d_ff, configs.d_model, num_kernels=configs.num_kernels) + ) +``` + +--- + +## 📊 阶段三:数据加载初始化 + +### 3.1 数据加载器创建 (`exp/exp_anomaly_detection.py`) + +```python +# exp_anomaly_detection.py:30-32 +def _get_data(self, flag): + data_set, data_loader = data_provider(self.args, flag) # ← 调用数据工厂 + return data_set, data_loader +``` + +### 3.2 数据工厂处理 (`data_provider/data_factory.py`) + +```python +# data_factory.py:21-40 +def data_provider(args, flag): + # 根据数据集名称选择对应的数据加载器类 + Data = data_dict[args.data] # ← 对于PSM数据集,获取PSMSegLoader + + if args.task_name == 'anomaly_detection': + drop_last = False + data_set = Data( + args=args, + root_path=args.root_path, # 数据根目录 + win_size=args.seq_len, # 滑动窗口大小 + flag=flag, # train/val/test标志 + ) + print(flag, len(data_set)) + + data_loader = DataLoader( + data_set, + batch_size=args.batch_size, + shuffle=shuffle_flag, + num_workers=args.num_workers, + drop_last=drop_last + ) + return data_set, data_loader +``` + +### 3.3 PSM数据集初始化 (`data_provider/data_loader.py`) + +```python +# data_loader.py:390-406 +class PSMSegLoader(Dataset): + def __init__(self, args, root_path, win_size, step=1, flag="train"): + self.flag = flag + self.step = step + self.win_size = win_size + + # 3.3.1 创建全局标准化器 + self.scaler = StandardScaler() + + # 3.3.2 加载训练数据并拟合标准化器 + data = pd.read_csv(os.path.join(root_path, 'train.csv')) + data = data.values[:, 1:] # 去掉时间戳列 + data = np.nan_to_num(data) # 处理NaN值 + + # 关键:全局标准化 - 基于训练集统计量 + self.scaler.fit(data) # ← 在训练集上拟合标准化器 + data = self.scaler.transform(data) # ← 标准化训练数据 + + # 3.3.3 
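**补充示例(演示代码,非项目源码)**:"只在训练集上 `fit`、再用同一个标准化器 `transform` 训练集和测试集"是避免信息泄漏的关键。下面用随机构造的数据给出一个最小示意(假设环境中已安装 NumPy 与 scikit-learn):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
train = rng.normal(loc=5.0, scale=2.0, size=(1000, 25))   # 模拟训练数据 [N, C]
test = rng.normal(loc=5.0, scale=2.0, size=(200, 25))     # 模拟测试数据

scaler = StandardScaler()
scaler.fit(train)                     # 统计量(均值/方差)只来自训练集
train_std = scaler.transform(train)   # 标准化训练集
test_std = scaler.transform(test)     # 用训练集统计量标准化测试集

print(train_std.mean(axis=0)[:3])     # 训练集各特征均值 ≈ 0
print(test_std.mean(axis=0)[:3])      # 测试集均值接近 0,但不会精确为 0
```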
加载测试数据并使用相同标准化器 + test_data = pd.read_csv(os.path.join(root_path, 'test.csv')) + test_data = test_data.values[:, 1:] + test_data = np.nan_to_num(test_data) + self.test = self.scaler.transform(test_data) # ← 使用训练集统计量标准化测试集 + + # 3.3.4 保存数据 + self.train = data + data_len = len(self.train) + self.val = self.train[(int)(data_len * 0.8):] # 验证集为训练集后20% + + # 3.3.5 加载测试标签 + self.test_labels = pd.read_csv(os.path.join(root_path, 'test_label.csv')).values[:, 1:] + + print("test:", self.test.shape) + print("train:", self.train.shape) +``` + +--- + +## 🎯 阶段四:训练过程 + +### 4.1 训练主循环 (`exp/exp_anomaly_detection.py`) + +```python +# exp_anomaly_detection.py:62-110 +def train(self, setting): + # 4.1.1 获取数据加载器 + train_data, train_loader = self._get_data(flag='train') + vali_data, vali_loader = self._get_data(flag='val') + test_data, test_loader = self._get_data(flag='test') + + # 4.1.2 设置训练组件 + model_optim = self._select_optimizer() # ← Adam优化器 + criterion = self._select_criterion() # ← MSE损失函数 + + # 4.1.3 Early Stopping + early_stopping = EarlyStopping(patience=self.args.patience, verbose=True) + + # 4.1.4 训练轮次循环 + for epoch in range(self.args.train_epochs): + iter_count = 0 + train_loss = [] + + self.model.train() # 设置为训练模式 + epoch_time = time.time() + + # 4.1.5 批次训练循环 + for i, (batch_x, batch_y) in enumerate(train_loader): + iter_count += 1 + model_optim.zero_grad() + + batch_x = batch_x.float().to(self.device) + + # 4.1.6 前向传播 - 关键步骤 + outputs = self.model(batch_x, None, None, None) # ← 调用forward() + + # 4.1.7 计算重构损失 + f_dim = -1 if self.args.features == 'MS' else 0 + outputs = outputs[:, :, f_dim:] + loss = criterion(outputs, batch_x) # ← MSE重构误差 + train_loss.append(loss.item()) + + # 4.1.8 反向传播和参数更新 + loss.backward() + model_optim.step() + + # 4.1.9 打印训练进度 + if (i + 1) % 100 == 0: + print("\titers: {0}, epoch: {1} | loss: {2:.7f}".format(i + 1, epoch + 1, loss.item())) + + # 4.1.10 验证和Early Stopping + train_loss = np.average(train_loss) + vali_loss = self.vali(vali_data, vali_loader, criterion) + test_loss = self.vali(test_data, test_loader, criterion) + + print("Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}".format( + epoch + 1, train_steps, train_loss, vali_loss, test_loss)) + + early_stopping(vali_loss, self.model, path) + if early_stopping.early_stop: + print("Early stopping") + break +``` + +### 4.2 前向传播入口 (`models/TimesNet.py`) + +```python +# TimesNet.py:202-216 +def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None): + """ + 前向传播的任务分发函数 + 根据task_name路由到不同的处理函数 + """ + if self.task_name == 'long_term_forecast' or self.task_name == 'short_term_forecast': + dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec) + return dec_out[:, -self.pred_len:, :] # [B, L, D] + + if self.task_name == 'imputation': + dec_out = self.imputation(x_enc, x_mark_enc, x_dec, x_mark_dec, mask) + return dec_out # [B, L, D] + + if self.task_name == 'anomaly_detection': + dec_out = self.anomaly_detection(x_enc) # ← 异常检测路由 + return dec_out # [B, L, D] + + if self.task_name == 'classification': + dec_out = self.classification(x_enc, x_mark_enc) + return dec_out # [B, N] + + return None +``` + +### 4.3 异常检测核心计算 (`models/TimesNet.py`) + +```python +# TimesNet.py:156-175 +def anomaly_detection(self, x_enc): + """ + 异常检测的核心处理函数 + 输入: x_enc [B, T, C] - 已经全局标准化的时序数据 + 输出: dec_out [B, T, C] - 重构后的时序数据 + """ + + # 4.3.1 局部标准化 (Non-stationary Transformer技术) + means = x_enc.mean(1, keepdim=True).detach() # [B, 1, C] 每个序列的时间均值 + x_enc = x_enc.sub(means) # 去均值 + stdev = 
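**补充示例(演示代码,非项目源码)**:这里的"局部标准化 + 输出端反标准化"是逐样本在时间维度上进行的,可以用下面这个独立的小片段验证两步互为逆操作(张量形状与正文一致,均为 `[B, T, C]`):

```python
import torch

B, T, C = 4, 100, 25
x = torch.randn(B, T, C) * 3.0 + 2.0                  # 模拟一批输入窗口

means = x.mean(1, keepdim=True)                       # [B, 1, C] 时间均值
stdev = torch.sqrt(torch.var(x, dim=1, keepdim=True,
                             unbiased=False) + 1e-5)  # [B, 1, C] 时间标准差

x_norm = (x - means) / stdev                          # 局部标准化(送入模型)
x_back = x_norm * stdev + means                       # 输出端反标准化

print(torch.allclose(x_back, x, atol=1e-4))           # True:标准化/反标准化可无损还原
```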
torch.sqrt( + torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5) # [B, 1, C] 时间标准差 + x_enc = x_enc.div(stdev) # 标准化 + + # 4.3.2 数据嵌入 - 将原始特征映射到高维空间 + enc_out = self.enc_embedding(x_enc, None) # [B, T, d_model] + + # 4.3.3 多层TimesBlock处理 - 核心特征提取 + for i in range(self.layer): + enc_out = self.layer_norm(self.model[i](enc_out)) # ← 逐层处理 + + # 4.3.4 输出投影 - 映射回原始特征空间 + dec_out = self.projection(enc_out) # [B, T, C] + + # 4.3.5 反标准化 - 恢复原始数据尺度 + dec_out = dec_out.mul( + (stdev[:, 0, :].unsqueeze(1).repeat(1, self.pred_len + self.seq_len, 1))) + dec_out = dec_out.add( + (means[:, 0, :].unsqueeze(1).repeat(1, self.pred_len + self.seq_len, 1))) + + return dec_out +``` + +### 4.4 TimesBlock核心计算 (`models/TimesNet.py`) + +```python +# TimesNet.py:33-67 +def forward(self, x): # TimesBlock.forward() + """ + TimesBlock的核心处理逻辑 + 1. FFT周期发现 + 2. 多周期2D卷积处理 + 3. 自适应聚合 + """ + B, T, N = x.size() + + # 4.4.1 FFT周期发现 + period_list, period_weight = FFT_for_Period(x, self.k) + + # 4.4.2 多周期处理 + res = [] + for i in range(self.k): + period = period_list[i] + + # 4.4.3 填充到周期整数倍长度 + if (self.seq_len + self.pred_len) % period != 0: + length = (((self.seq_len + self.pred_len) // period) + 1) * period + padding = torch.zeros([x.shape[0], (length - (self.seq_len + self.pred_len)), x.shape[2]]).to(x.device) + out = torch.cat([x, padding], dim=1) + else: + length = (self.seq_len + self.pred_len) + out = x + + # 4.4.4 重塑为2D矩阵 - 关键创新 + out = out.reshape(B, length // period, period, N).permute(0, 3, 1, 2).contiguous() + # 形状变化: [B, T, N] → [B, N, 周期数, 周期长度] + + # 4.4.5 2D卷积处理 - 捕获周期内和周期间的模式 + out = self.conv(out) # ← Inception卷积块处理 + + # 4.4.6 重塑回1D时序 + out = out.permute(0, 2, 3, 1).reshape(B, -1, N) + res.append(out[:, :(self.seq_len + self.pred_len), :]) + + # 4.4.7 自适应聚合 - 加权融合多个周期的结果 + res = torch.stack(res, dim=-1) # [B, T, N, k] + period_weight = F.softmax(period_weight, dim=1) + period_weight = period_weight.unsqueeze(1).unsqueeze(1).repeat(1, T, N, 1) + res = torch.sum(res * period_weight, -1) # 加权求和 + + # 4.4.8 残差连接 + res = res + x + return res +``` + +### 4.5 FFT周期发现 (`models/TimesNet.py`) + +```python +# TimesNet.py:8-17 +def FFT_for_Period(x, k=2): + """ + 使用FFT发现时序数据的主要周期 + """ + # [B, T, C] + xf = torch.fft.rfft(x, dim=1) # 实数FFT变换 + + # 通过幅度谱找到主要周期 + frequency_list = abs(xf).mean(0).mean(-1) # 平均幅度谱 + frequency_list[0] = 0 # 忽略直流分量 + _, top_list = torch.topk(frequency_list, k) # 找到top-k频率 + top_list = top_list.detach().cpu().numpy() + period = x.shape[1] // top_list # 周期 = 序列长度 / 频率索引 + + return period, abs(xf).mean(-1)[:, top_list] # 返回周期和对应权重 +``` + +--- + +## 📈 阶段五:验证过程 + +### 5.1 验证函数 (`exp/exp_anomaly_detection.py`) + +```python +# exp_anomaly_detection.py:41-61 +def vali(self, vali_data, vali_loader, criterion): + """ + 验证函数:计算验证集上的重构损失 + 用于Early Stopping和模型选择 + """ + total_loss = [] + self.model.eval() # ← 切换到评估模式 + + with torch.no_grad(): # 关闭梯度计算 + for i, (batch_x, _) in enumerate(vali_loader): + batch_x = batch_x.float().to(self.device) + + # 5.1.1 前向传播(无梯度计算) + outputs = self.model(batch_x, None, None, None) + + # 5.1.2 计算验证损失 + f_dim = -1 if self.args.features == 'MS' else 0 + outputs = outputs[:, :, f_dim:] + pred = outputs.detach().cpu() + true = batch_x.detach().cpu() + + loss = criterion(pred, true) + total_loss.append(loss) + + total_loss = np.average(total_loss) + self.model.train() # ← 切换回训练模式 + return total_loss +``` + +--- + +## 🧪 阶段六:测试与异常检测 + +### 6.1 测试主函数 (`exp/exp_anomaly_detection.py`) + +```python +# exp_anomaly_detection.py:126-208 +def test(self, setting, test=0): + """ 
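**补充示例(演示代码,非项目源码)**:为直观理解 `FFT_for_Period` 的行为,下面构造一个周期为 25 的正弦序列,独立复现"幅度谱取 top-k → 频率索引换算周期"的过程(仅作示意,对源码逻辑做了简化):

```python
import math

import torch

B, T, C = 2, 100, 3
t = torch.arange(T, dtype=torch.float32)
x = torch.sin(2 * math.pi * t / 25).reshape(1, T, 1).repeat(B, 1, C)
x = x + 0.05 * torch.randn(B, T, C)           # 周期为 25 的信号 + 少量噪声

xf = torch.fft.rfft(x, dim=1)                 # [B, T//2+1, C]
amp = xf.abs().mean(0).mean(-1)               # 平均幅度谱,形状 [T//2+1]
amp[0] = 0                                    # 忽略直流分量
_, top = torch.topk(amp, k=1)

freq_idx = top.item()                         # 预期为 4(100 个点中含 4 个完整周期)
period = T // freq_idx                        # 预期为 25

print(freq_idx, period)                       # 4 25
```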
+ 测试函数:执行异常检测并评估性能 + """ + test_data, test_loader = self._get_data(flag='test') + train_data, train_loader = self._get_data(flag='train') + + # 6.1.1 加载最佳模型 + if test: + print('loading model') + self.model.load_state_dict(torch.load(os.path.join('./checkpoints/' + setting, 'checkpoint.pth'))) + + self.model.eval() + self.anomaly_criterion = nn.MSELoss(reduce=False) # 逐元素MSE + + # 6.1.2 训练集统计 - 建立正常数据的重构误差基线 + attens_energy = [] + with torch.no_grad(): + for i, (batch_x, batch_y) in enumerate(train_loader): + batch_x = batch_x.float().to(self.device) + + # 重构训练数据 + outputs = self.model(batch_x, None, None, None) + + # 计算重构误差 + score = torch.mean(self.anomaly_criterion(batch_x, outputs), dim=-1) + score = score.detach().cpu().numpy() + attens_energy.append(score) + + attens_energy = np.concatenate(attens_energy, axis=0).reshape(-1) + train_energy = np.array(attens_energy) + + # 6.1.3 测试集异常检测 + attens_energy = [] + test_labels = [] + for i, (batch_x, batch_y) in enumerate(test_loader): + batch_x = batch_x.float().to(self.device) + + # 重构测试数据 + outputs = self.model(batch_x, None, None, None) + + # 计算重构误差 + score = torch.mean(self.anomaly_criterion(batch_x, outputs), dim=-1) + score = score.detach().cpu().numpy() + attens_energy.append(score) + test_labels.append(batch_y) + + attens_energy = np.concatenate(attens_energy, axis=0).reshape(-1) + test_energy = np.array(attens_energy) + + # 6.1.4 阈值确定 - 基于训练集和测试集的联合分布 + combined_energy = np.concatenate([train_energy, test_energy], axis=0) + threshold = np.percentile(combined_energy, 100 - self.args.anomaly_ratio) + print("Threshold :", threshold) + + # 6.1.5 异常判定 - 二值化 + pred = (test_energy > threshold).astype(int) # 重构误差大于阈值则为异常 + test_labels = np.concatenate(test_labels, axis=0).reshape(-1) + test_labels = np.array(test_labels) + gt = test_labels.astype(int) + + print("pred: ", pred.shape) + print("gt: ", gt.shape) + + # 6.1.6 检测调整 - 后处理 + gt, pred = adjustment(gt, pred) # 调整预测结果 + + # 6.1.7 性能评估 + accuracy = accuracy_score(gt, pred) + precision, recall, f_score, support = precision_recall_fscore_support(gt, pred, average='binary') + print("Accuracy : {:0.4f}, Precision : {:0.4f}, Recall : {:0.4f}, F-score : {:0.4f} ".format( + accuracy, precision, recall, f_score)) + + # 6.1.8 保存结果 + f = open("result_anomaly_detection.txt", 'a') + f.write(setting + " \n") + f.write("Accuracy : {:0.4f}, Precision : {:0.4f}, Recall : {:0.4f}, F-score : {:0.4f} ".format( + accuracy, precision, recall, f_score)) + f.write('\n') + f.write('\n') + f.close() +``` + +--- + +## 🔄 完整调用链总结 + +### 调用流程图 + +``` +📋 启动命令 + ↓ +🚀 run.py:main() + ├── 解析参数 & 设备配置 + ├── 选择Exp_Anomaly_Detection类 + └── 创建实验实例 + ↓ +🏗️ Exp_Anomaly_Detection.__init__() + └── 调用父类 Exp_Basic.__init__() + ├── 注册模型字典 + ├── 获取计算设备 + └── 构建模型: _build_model() + ↓ +🔧 TimesNet.Model.__init__() + ├── 保存配置参数 + ├── 创建TimesBlock模块列表 + ├── 创建数据嵌入层 + ├── 创建LayerNorm + └── 创建输出投影层 + ↓ +📊 数据初始化 + └── _get_data() → data_provider() → PSMSegLoader.__init__() + ├── 创建StandardScaler + ├── 加载训练数据并拟合标准化器 + ├── 全局标准化变换 + └── 加载测试数据并标准化 + ↓ +🎯 训练阶段: exp.train(setting) + ├── 获取数据加载器 + ├── 设置优化器和损失函数 + └── 训练循环: + for epoch in range(train_epochs): + for batch in train_loader: + ├── 前向传播: model(batch_x) + │ └── forward() → anomaly_detection() + │ ├── 局部标准化 + │ ├── 数据嵌入 + │ ├── 多层TimesBlock处理 + │ │ ├── FFT周期发现 + │ │ ├── 多周期2D卷积 + │ │ ├── 自适应聚合 + │ │ └── 残差连接 + │ ├── 输出投影 + │ └── 反标准化 + ├── 计算MSE重构损失 + ├── 反向传播 + └── 参数更新 + ├── 验证: vali() + └── Early Stopping检查 + ↓ +🧪 测试阶段: exp.test(setting) + ├── 加载最佳模型 + ├── 训练集重构误差统计 + ├── 
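**补充示例(演示代码,非项目源码)**:上面 `test()` 中"合并能量分布 → 取百分位数阈值 → 二值化"的判定逻辑可以用下面的 NumPy 最小示意复现;数据为随机构造,`anomaly_ratio=1` 表示假设约 1% 的点为异常:

```python
import numpy as np

rng = np.random.default_rng(0)
train_energy = rng.normal(1.0, 0.2, size=10000)           # 正常样本的重构误差
test_energy = np.concatenate([
    rng.normal(1.0, 0.2, size=9900),                       # 测试集中的正常点
    rng.normal(3.0, 0.3, size=100),                        # 测试集中的异常点(误差偏大)
])

anomaly_ratio = 1.0                                        # 单位:百分比
combined = np.concatenate([train_energy, test_energy])
threshold = np.percentile(combined, 100 - anomaly_ratio)   # 取 99 分位数作为阈值

pred = (test_energy > threshold).astype(int)               # 1 = 判定为异常
print("threshold:", round(float(threshold), 4))
print("flagged:", int(pred.sum()), "/", len(pred))
```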
测试集重构误差计算 + ├── 阈值确定 (百分位数) + ├── 异常判定 (二值化) + ├── 后处理调整 + └── 性能评估 (Precision, Recall, F1) +``` + +### 关键函数调用频次 + +| 函数 | 调用次数 | 说明 | +|------|----------|------| +| `TimesNet.__init__()` | 1次 | 程序启动时调用一次 | +| `PSMSegLoader.__init__()` | 3次 | train/val/test各一次 | +| `forward()` | N×M次 | N个epoch × M个batch | +| `anomaly_detection()` | N×M次 | 每次前向传播调用 | +| `TimesBlock.forward()` | N×M×L次 | L为层数(e_layers) | +| `FFT_for_Period()` | N×M×L次 | 每个TimesBlock都调用 | +| `PSMSegLoader.__getitem__()` | 数据总量次 | 每个样本调用一次 | + +### 数据流形状变化 + +``` +原始CSV数据 → 加载与预处理 + ↓ +全局标准化: StandardScaler.transform() + ↓ +滑动窗口切分: [B, T, C] (batch_size, seq_len, features) + ↓ +局部标准化: 每个样本在时间维度标准化 [B, T, C] + ↓ +数据嵌入: [B, T, C] → [B, T, d_model] + ↓ +TimesBlock处理: + FFT周期发现: [B, T, d_model] → 周期列表 + 1D→2D变换: [B, T, d_model] → [B, d_model, 周期数, 周期长度] + 2D卷积: Inception块处理 + 2D→1D变换: [B, d_model, 周期数, 周期长度] → [B, T, d_model] + 自适应聚合: 多周期结果加权融合 + ↓ +输出投影: [B, T, d_model] → [B, T, C] + ↓ +反标准化: 恢复到原始数据尺度 [B, T, C] + ↓ +重构误差计算: MSE(原始, 重构) → 异常分数 +``` + +### 关键设计原理 + +1. **双层标准化策略**: + - 全局标准化: 解决特征尺度问题 + - 局部标准化: 解决非平稳时序问题 + +2. **TimesNet核心创新**: + - FFT自动发现周期性 + - 1D→2D变换利用卷积捕获模式 + - 多周期自适应聚合 + +3. **异常检测原理**: + - 基于重构误差的无监督方法 + - 阈值基于训练集统计确定 + - 后处理调整提高实用性 + +4. **端到端训练**: + - 统一的重构损失函数 + - Early Stopping防止过拟合 + - 多GPU支持大规模训练 + +--- + +## 📝 总结 + +TimesNet在异常检测任务中展现了以下特点: + +1. **模块化设计**: 清晰的分层架构,便于理解和扩展 +2. **高效训练**: 端到端的训练流程,自动化程度高 +3. **技术创新**: FFT周期发现 + 2D卷积的独特组合 +4. **工程实用**: 完善的数据处理、模型保存、性能评估流程 + +这个完整的流程涵盖了从命令行启动到最终结果输出的每一个关键步骤,为理解和使用TimesNet提供了详细的参考。 \ No newline at end of file diff --git a/auto_activate_setup.sh b/auto_activate_setup.sh new file mode 100755 index 000000000..6367e0a64 --- /dev/null +++ b/auto_activate_setup.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# TimesNet项目自动激活设置脚本 +# 运行此脚本将在你的.bashrc中添加自动激活功能 + +TIMESNET_PATH="/home/wanting/TimesNet" +BASHRC_FILE="$HOME/.bashrc" + +echo "🔧 正在设置TimesNet项目虚拟环境自动激活..." + +# 检查是否已经添加过配置 +if grep -q "# TimesNet Auto Activation" "$BASHRC_FILE" 2>/dev/null; then + echo "⚠️ .bashrc中已存在TimesNet自动激活配置" + echo "如需重新配置,请先手动删除现有配置后再运行此脚本" + exit 1 +fi + +# 添加自动激活功能到.bashrc +cat >> "$BASHRC_FILE" << 'EOF' + +# TimesNet Auto Activation +# 当进入TimesNet项目目录时自动激活虚拟环境 +timesnet_auto_activate() { + local current_dir="$(pwd)" + local timesnet_path="/home/wanting/TimesNet" + + # 检查当前目录是否在TimesNet项目路径下 + if [[ "$current_dir" == "$timesnet_path"* ]]; then + local venv_path="$timesnet_path/venv" + + # 如果虚拟环境存在且尚未激活 + if [ -d "$venv_path" ] && [ "$VIRTUAL_ENV" != "$venv_path" ]; then + echo "🚀 检测到TimesNet项目,正在激活虚拟环境..." + source "$venv_path/bin/activate" + echo "✅ TimesNet虚拟环境已激活" + fi + else + # 如果离开TimesNet项目目录且激活的是TimesNet的虚拟环境,则deactivate + if [ "$VIRTUAL_ENV" == "$timesnet_path/venv" ]; then + echo "👋 离开TimesNet项目,正在关闭虚拟环境..." + deactivate + fi + fi +} + +# 在每次cd命令后执行检查 +cd() { + builtin cd "$@" + timesnet_auto_activate +} + +# 在启动新shell时检查当前目录 +timesnet_auto_activate +EOF + +echo "✅ 自动激活配置已添加到 $BASHRC_FILE" +echo "" +echo "📋 使用说明:" +echo "1. 重新加载.bashrc: source ~/.bashrc" +echo "2. 或者重新打开终端" +echo "3. 当你cd到TimesNet项目目录时,虚拟环境会自动激活" +echo "4. 
当你离开项目目录时,虚拟环境会自动关闭" +echo "" +echo "🔧 如需移除自动激活功能,请手动编辑 $BASHRC_FILE 并删除 '# TimesNet Auto Activation' 部分" \ No newline at end of file diff --git a/data_preprocess/analyze_time_continuity.py b/data_preprocess/analyze_time_continuity.py new file mode 100644 index 000000000..17aec3129 --- /dev/null +++ b/data_preprocess/analyze_time_continuity.py @@ -0,0 +1,365 @@ +import pandas as pd +import numpy as np +from datetime import datetime, timedelta +import os +from utils.logger import setup_logger +import matplotlib.pyplot as plt +import seaborn as sns +from typing import Dict, Tuple, List +import warnings +warnings.filterwarnings('ignore') + +def load_minute_sampled_data(file_path: str) -> pd.DataFrame: + """ + Load the minute-sampled parquet data + + Args: + file_path: Path to the parquet file + + Returns: + pd.DataFrame: Loaded dataframe + """ + logger.info(f"Loading data from {file_path}") + df = pd.read_parquet(file_path) + logger.info(f"Data loaded successfully. Shape: {df.shape}") + logger.info(f"Columns: {list(df.columns)}") + + # Display basic info about the dataset + logger.info(f"Data types:\n{df.dtypes}") + logger.info(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB") + + return df + +def identify_time_column(df: pd.DataFrame) -> str: + """ + Identify the time column in the dataframe + + Args: + df: Input dataframe + + Returns: + str: Name of the time column + """ + logger.info("Identifying time column...") + + # Common time column names + time_cols = ['timestamp', 'time', 'datetime', 'date', 'ts', 'Time', 'Timestamp', 'DateTime'] + + # Check for explicit time column names + for col in time_cols: + if col in df.columns: + logger.info(f"Found time column: {col}") + return col + + # Check for columns with datetime-like data types + for col in df.columns: + if pd.api.types.is_datetime64_any_dtype(df[col]): + logger.info(f"Found datetime column: {col}") + return col + + # Check for columns that can be converted to datetime + for col in df.columns: + try: + pd.to_datetime(df[col].head(100)) + logger.info(f"Found convertible datetime column: {col}") + return col + except: + continue + + # If index is datetime + if pd.api.types.is_datetime64_any_dtype(df.index): + logger.info("Using datetime index as time column") + return 'index' + + logger.warning("No time column found. 
Please specify manually.") + return None + +def prepare_time_data(df: pd.DataFrame, time_col: str) -> pd.DataFrame: + """ + Prepare and standardize time data + + Args: + df: Input dataframe + time_col: Name of the time column + + Returns: + pd.DataFrame: Dataframe with prepared time data + """ + logger.info(f"Preparing time data using column: {time_col}") + + df_copy = df.copy() + + if time_col == 'index': + df_copy['timestamp'] = df_copy.index + time_col = 'timestamp' + + # Convert to datetime if not already + if not pd.api.types.is_datetime64_any_dtype(df_copy[time_col]): + logger.info("Converting to datetime...") + df_copy[time_col] = pd.to_datetime(df_copy[time_col]) + + # Sort by time + df_copy = df_copy.sort_values(time_col).reset_index(drop=True) + + logger.info(f"Time range: {df_copy[time_col].min()} to {df_copy[time_col].max()}") + logger.info(f"Total time span: {df_copy[time_col].max() - df_copy[time_col].min()}") + + return df_copy, time_col + +def analyze_time_continuity(df: pd.DataFrame, time_col: str) -> Dict: + """ + Analyze time continuity of the dataset + + Args: + df: Input dataframe with time data + time_col: Name of the time column + + Returns: + Dict: Analysis results + """ + logger.info("Starting time continuity analysis...") + + # Basic statistics + time_series = df[time_col] + total_records = len(df) + + # Calculate time differences + time_diffs = time_series.diff().dropna() + + # Expected interval (1 minute for minute-sampled data) + expected_interval = timedelta(minutes=1) + + # Identify gaps + gaps = time_diffs[time_diffs > expected_interval] + + # Analysis results + results = { + 'total_records': total_records, + 'time_range': { + 'start': time_series.min(), + 'end': time_series.max(), + 'duration': time_series.max() - time_series.min() + }, + 'expected_interval': expected_interval, + 'actual_intervals': { + 'mean': time_diffs.mean(), + 'median': time_diffs.median(), + 'std': time_diffs.std(), + 'min': time_diffs.min(), + 'max': time_diffs.max() + }, + 'gaps': { + 'total_gaps': len(gaps), + 'gap_positions': gaps.index.tolist(), + 'gap_durations': gaps.values, + 'largest_gap': gaps.max() if len(gaps) > 0 else timedelta(0), + 'total_missing_time': gaps.sum() if len(gaps) > 0 else timedelta(0) + }, + 'continuity_stats': {} + } + + # Calculate expected vs actual records + if results['time_range']['duration'] > timedelta(0): + expected_records = int(results['time_range']['duration'].total_seconds() / 60) + 1 + continuity_percentage = (total_records / expected_records) * 100 + missing_records = expected_records - total_records + else: + expected_records = total_records + continuity_percentage = 100.0 + missing_records = 0 + + results['continuity_stats'] = { + 'expected_records': expected_records, + 'actual_records': total_records, + 'missing_records': missing_records, + 'continuity_percentage': continuity_percentage + } + + # Duplicate timestamps + duplicates = time_series.duplicated().sum() + results['duplicates'] = { + 'count': duplicates, + 'percentage': (duplicates / total_records) * 100 + } + + logger.info(f"Analysis completed. 
Found {len(gaps)} gaps in time series") + logger.info(f"Continuity percentage: {continuity_percentage:.2f}%") + + return results + +def generate_time_analysis_report(results: Dict) -> str: + """ + Generate a comprehensive report of time analysis + + Args: + results: Analysis results dictionary + + Returns: + str: Formatted report + """ + logger.info("Generating time analysis report...") + + report = [] + report.append("=" * 80) + report.append("TIME CONTINUITY ANALYSIS REPORT") + report.append("=" * 80) + report.append("") + + # Basic Information + report.append("📊 BASIC INFORMATION") + report.append("-" * 40) + report.append(f"Total Records: {results['total_records']:,}") + report.append(f"Time Range: {results['time_range']['start']} to {results['time_range']['end']}") + report.append(f"Duration: {results['time_range']['duration']}") + report.append(f"Expected Interval: {results['expected_interval']}") + report.append("") + + # Continuity Statistics + report.append("📈 CONTINUITY STATISTICS") + report.append("-" * 40) + report.append(f"Expected Records: {results['continuity_stats']['expected_records']:,}") + report.append(f"Actual Records: {results['continuity_stats']['actual_records']:,}") + report.append(f"Missing Records: {results['continuity_stats']['missing_records']:,}") + report.append(f"Continuity Percentage: {results['continuity_stats']['continuity_percentage']:.2f}%") + report.append("") + + # Interval Analysis + report.append("⏱️ INTERVAL ANALYSIS") + report.append("-" * 40) + report.append(f"Mean Interval: {results['actual_intervals']['mean']}") + report.append(f"Median Interval: {results['actual_intervals']['median']}") + report.append(f"Std Deviation: {results['actual_intervals']['std']}") + report.append(f"Min Interval: {results['actual_intervals']['min']}") + report.append(f"Max Interval: {results['actual_intervals']['max']}") + report.append("") + + # Gap Analysis + report.append("🕳️ GAP ANALYSIS") + report.append("-" * 40) + report.append(f"Total Gaps: {results['gaps']['total_gaps']}") + report.append(f"Largest Gap: {results['gaps']['largest_gap']}") + report.append(f"Total Missing Time: {results['gaps']['total_missing_time']}") + + if results['gaps']['total_gaps'] > 0: + report.append("\nTop 10 Largest Gaps:") + gap_durations = sorted(results['gaps']['gap_durations'], reverse=True)[:10] + for i, gap in enumerate(gap_durations, 1): + report.append(f" {i}. 
{gap}") + report.append("") + + # Duplicate Analysis + report.append("🔄 DUPLICATE ANALYSIS") + report.append("-" * 40) + report.append(f"Duplicate Timestamps: {results['duplicates']['count']}") + report.append(f"Duplicate Percentage: {results['duplicates']['percentage']:.2f}%") + report.append("") + + # Quality Assessment + report.append("✅ QUALITY ASSESSMENT") + report.append("-" * 40) + continuity_pct = results['continuity_stats']['continuity_percentage'] + if continuity_pct >= 95: + quality = "EXCELLENT" + emoji = "🟢" + elif continuity_pct >= 90: + quality = "GOOD" + emoji = "🟡" + elif continuity_pct >= 80: + quality = "FAIR" + emoji = "🟠" + else: + quality = "POOR" + emoji = "🔴" + + report.append(f"Overall Quality: {emoji} {quality}") + report.append(f"Data Completeness: {continuity_pct:.2f}%") + + if results['gaps']['total_gaps'] == 0: + report.append("✅ No gaps detected - Perfect continuity!") + else: + report.append(f"⚠️ {results['gaps']['total_gaps']} gaps detected") + + if results['duplicates']['count'] == 0: + report.append("✅ No duplicate timestamps") + else: + report.append(f"⚠️ {results['duplicates']['count']} duplicate timestamps") + + report.append("") + report.append("=" * 80) + + return "\n".join(report) + +def save_analysis_results(results: Dict, report: str, output_dir: str = "dataset/analysis"): + """ + Save analysis results to files + + Args: + results: Analysis results dictionary + report: Generated report string + output_dir: Output directory for saving results + """ + logger.info(f"Saving analysis results to {output_dir}") + + os.makedirs(output_dir, exist_ok=True) + + # Save detailed results as pickle + import pickle + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + with open(f"{output_dir}/time_continuity_results_{timestamp}.pkl", "wb") as f: + pickle.dump(results, f) + + # Save report as text file + with open(f"{output_dir}/time_continuity_report_{timestamp}.txt", "w", encoding='utf-8') as f: + f.write(report) + + logger.info("Analysis results saved successfully") + + +def main(): + """ + Main function to perform time continuity analysis + """ + # File path + file_path = "dataset/data_preprocess/minute_sampled_Contacting_cleaned.parquet" + + if not os.path.exists(file_path): + logger.error(f"File not found: {file_path}") + return + + try: + # Step 1: Load data + df = load_minute_sampled_data(file_path) + + # Step 2: Identify time column + time_col = identify_time_column(df) + if time_col is None: + logger.error("Could not identify time column. 
Please check your data.") + return + + # Step 3: Prepare time data + df_prepared, time_col = prepare_time_data(df, time_col) + + # Step 4: Analyze time continuity + analysis_results = analyze_time_continuity(df_prepared, time_col) + + # Step 5: Generate report + report = generate_time_analysis_report(analysis_results) + + # Step 6: Print report + print(report) + + # Step 7: Save results + save_analysis_results(analysis_results, report) + + + logger.info("Time continuity analysis completed successfully!") + + except Exception as e: + logger.error(f"Error during analysis: {str(e)}") + raise + +if __name__ == "__main__": + # Initialize logger + logger = setup_logger("analyze_time_continuity") + main() \ No newline at end of file diff --git a/data_preprocess/split_train_test.py b/data_preprocess/split_train_test.py new file mode 100644 index 000000000..32fbf3912 --- /dev/null +++ b/data_preprocess/split_train_test.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import pandas as pd +import os + +def split_time_series_data(file_path, output_dir='dataset/split_data', time_col='TimeStamp', train_size=0.75): + """ + Split time series data into train and test sets based on time sorting. + + Args: + file_path (str): Path to the input Parquet file + output_dir (str): Directory to save the output files + time_col (str): Name of the timestamp column + train_size (float): Proportion of data to use for training + """ + print(f"Loading data from: {file_path}") + + # Create output directory if it doesn't exist + os.makedirs(output_dir, exist_ok=True) + + # Read parquet file + df = pd.read_parquet(file_path) + print(f"Data loaded. Total rows: {len(df)}") + + # Sort by timestamp + print(f"Sorting data by {time_col}...") + df = df.sort_values(by=time_col) + + # Calculate split point + split_idx = int(len(df) * train_size) + + # Split data + train_df = df.iloc[:split_idx] + test_df = df.iloc[split_idx:] + + print(f"Data split: train={len(train_df)} rows, test={len(test_df)} rows") + + # Get file name without extension + base_name = os.path.splitext(os.path.basename(file_path))[0] + + # Save train and test data + train_path = os.path.join(output_dir, f"train.parquet") + test_path = os.path.join(output_dir, f"test.parquet") + + print(f"Saving train data to: {train_path}") + train_df.to_parquet(train_path, index=False) + + print(f"Saving test data to: {test_path}") + test_df.to_parquet(test_path, index=False) + + # Print time range for each set + print("\nTime ranges:") + print(f"Train: {train_df[time_col].min()} to {train_df[time_col].max()}") + print(f"Test: {test_df[time_col].min()} to {test_df[time_col].max()}") + + print("\nData split and saved successfully!") + +if __name__ == "__main__": + # Input file path + file_path = "dataset/data_preprocess/minute_sampled_Contacting_cleaned.parquet" + + # Split data with default parameters + split_time_series_data(file_path) \ No newline at end of file diff --git a/data_preprocess/subsample_minute.py b/data_preprocess/subsample_minute.py new file mode 100644 index 000000000..40e46d617 --- /dev/null +++ b/data_preprocess/subsample_minute.py @@ -0,0 +1,292 @@ +""" +数据下采样模块:将按秒采样的数据下采样至分钟级别 +""" + +import pandas as pd +import numpy as np +import os +from pathlib import Path +from typing import Optional, List + +# 添加项目根目录到Python路径 +import sys +sys.path.append('/home/wanting/Time-Series-Library') + +# 导入自定义logger +from utils.logger import setup_logger + +# 创建专用logger +logger = setup_logger("subsample_minute") + + +def load_data(file_path: str) -> 
pd.DataFrame: + """ + 加载parquet数据文件 + + Args: + file_path: 数据文件路径 + + Returns: + 加载的DataFrame + """ + logger.info(f"开始加载数据文件: {file_path}") + + if not os.path.exists(file_path): + logger.error(f"数据文件不存在: {file_path}") + raise FileNotFoundError(f"数据文件不存在: {file_path}") + + try: + df = pd.read_parquet(file_path) + logger.info(f"数据加载成功,形状: {df.shape}") + logger.info(f"列名: {list(df.columns)}") + return df + except Exception as e: + logger.error(f"加载数据文件失败: {e}") + raise + + +def remove_columns(df: pd.DataFrame, columns_to_remove: List[str]) -> pd.DataFrame: + """ + 移除指定列 + + Args: + df: 输入DataFrame + columns_to_remove: 要移除的列名列表 + + Returns: + 移除指定列后的DataFrame + """ + logger.info(f"移除列: {columns_to_remove}") + + existing_columns = [col for col in columns_to_remove if col in df.columns] + missing_columns = [col for col in columns_to_remove if col not in df.columns] + + if missing_columns: + logger.warning(f"以下列不存在,将被忽略: {missing_columns}") + + if existing_columns: + df_cleaned = df.drop(columns=existing_columns) + logger.info(f"成功移除列: {existing_columns}") + logger.info(f"移除后数据形状: {df_cleaned.shape}") + return df_cleaned + else: + logger.warning("没有找到要移除的列") + return df.copy() + + +def prepare_timestamp_column(df: pd.DataFrame, timestamp_col: str = 'TimeStamp') -> pd.DataFrame: + """ + 准备时间戳列,确保正确的时间格式并按时间戳排序 + + Args: + df: 输入DataFrame + timestamp_col: 时间戳列名 + + Returns: + 处理后的DataFrame + """ + logger.info(f"处理时间戳列: {timestamp_col}") + + if timestamp_col not in df.columns: + logger.error(f"时间戳列 '{timestamp_col}' 不存在") + raise ValueError(f"时间戳列 '{timestamp_col}' 不存在") + + # 确保时间戳是datetime格式 + if not pd.api.types.is_datetime64_any_dtype(df[timestamp_col]): + logger.info("转换时间戳列为datetime格式") + df[timestamp_col] = pd.to_datetime(df[timestamp_col]) + + logger.info(f"排序前时间戳范围: {df[timestamp_col].min()} 到 {df[timestamp_col].max()}") + + # 按时间戳排序 + logger.info("按时间戳排序数据...") + df_sorted = df.sort_values(by=timestamp_col).reset_index(drop=True) + + logger.info(f"排序后时间戳范围: {df_sorted[timestamp_col].min()} 到 {df_sorted[timestamp_col].max()}") + logger.info(f"时间戳列数据类型: {df_sorted[timestamp_col].dtype}") + logger.info(f"数据是否按时间戳排序: {df_sorted[timestamp_col].is_monotonic_increasing}") + + return df_sorted + + +def subsample_to_minute(df: pd.DataFrame, timestamp_col: str = 'TimeStamp', + aggregation_method: str = 'mean', label: str = 'left') -> pd.DataFrame: + """ + 将按秒采样的数据下采样至分钟级别 + + Args: + df: 输入DataFrame + timestamp_col: 时间戳列名 + aggregation_method: 聚合方法 ('mean', 'median', 'first', 'last', 'max', 'min') + label: 时间戳标签位置 ('left', 'right', 'center') + - 'left': 使用时间窗口的开始时间 (默认) + - 'right': 使用时间窗口的结束时间 + - 'center': 使用时间窗口的中间时间 + + Returns: + 下采样后的DataFrame + """ + logger.info(f"开始下采样至分钟级别,聚合方法: {aggregation_method}, 时间戳标签: {label}") + + # 设置时间戳为索引 + df_indexed = df.set_index(timestamp_col) + + # 获取数值列 + numeric_columns = df_indexed.select_dtypes(include=[np.number]).columns + logger.info(f"数值列数量: {len(numeric_columns)}") + + # 确定时间戳标签参数 + if label == 'left': + resample_label = 'left' + elif label == 'right': + resample_label = 'right' + elif label == 'center': + resample_label = 'left' # 先用left,后面会调整到center + else: + logger.warning(f"未知的标签类型 '{label}',使用默认的'left'") + resample_label = 'left' + + # 下采样到分钟级别 + logger.info("执行下采样操作...") + + if aggregation_method == 'mean': + df_resampled = df_indexed[numeric_columns].resample('1min', label=resample_label).mean() + elif aggregation_method == 'median': + df_resampled = df_indexed[numeric_columns].resample('1min', label=resample_label).median() + elif 
aggregation_method == 'first': + df_resampled = df_indexed[numeric_columns].resample('1min', label=resample_label).first() + elif aggregation_method == 'last': + df_resampled = df_indexed[numeric_columns].resample('1min', label=resample_label).last() + elif aggregation_method == 'max': + df_resampled = df_indexed[numeric_columns].resample('1min', label=resample_label).max() + elif aggregation_method == 'min': + df_resampled = df_indexed[numeric_columns].resample('1min', label=resample_label).min() + else: + logger.warning(f"未知的聚合方法 '{aggregation_method}',使用默认的'mean'") + df_resampled = df_indexed[numeric_columns].resample('1min', label=resample_label).mean() + + # 如果是center标签,需要调整时间戳到中间点 + if label == 'center': + logger.info("调整时间戳到时间窗口中心点...") + df_resampled.index = df_resampled.index + pd.Timedelta(seconds=30) + + # 重置索引,将时间戳重新设为列 + df_resampled = df_resampled.reset_index() + + # 移除包含NaN的行(可能是由于某些分钟没有数据) + initial_rows = len(df_resampled) + df_resampled = df_resampled.dropna() + final_rows = len(df_resampled) + + if initial_rows != final_rows: + logger.info(f"移除了 {initial_rows - final_rows} 行包含NaN的数据") + + logger.info(f"下采样完成,新数据形状: {df_resampled.shape}") + logger.info(f"新时间戳范围: {df_resampled[timestamp_col].min()} 到 {df_resampled[timestamp_col].max()}") + + return df_resampled + + +def save_processed_data(df: pd.DataFrame, output_path: str) -> None: + """ + 保存处理后的数据到parquet格式 + + Args: + df: 要保存的DataFrame + output_path: 输出文件路径 + """ + logger.info(f"保存处理后的数据到: {output_path}") + + # 创建输出目录 + output_dir = os.path.dirname(output_path) + os.makedirs(output_dir, exist_ok=True) + + try: + df.to_parquet(output_path, index=False) + logger.info(f"数据保存成功,文件大小: {os.path.getsize(output_path) / (1024*1024):.2f} MB") + except Exception as e: + logger.error(f"保存数据失败: {e}") + raise + + +def process_subsample_minute(input_file: str, + output_file: Optional[str] = None, + columns_to_remove: List[str] = ['ID', 'Station'], + aggregation_method: str = 'mean', + timestamp_col: str = 'TimeStamp', + label: str = 'left') -> str: + """ + 完整的下采样处理流程 + + Args: + input_file: 输入文件路径 + output_file: 输出文件路径(可选) + columns_to_remove: 要移除的列名列表 + aggregation_method: 聚合方法 + timestamp_col: 时间戳列名 + label: 时间戳标签位置 ('left', 'right', 'center') + + Returns: + 输出文件路径 + """ + logger.info("="*50) + logger.info("开始数据下采样处理流程") + logger.info("="*50) + + # 步骤1: 加载数据 + df = load_data(input_file) + + # 步骤2: 移除指定列 + df_cleaned = remove_columns(df, columns_to_remove) + + # 步骤3: 准备时间戳列 + df_prepared = prepare_timestamp_column(df_cleaned, timestamp_col) + + # 步骤4: 下采样至分钟级别 + df_subsampled = subsample_to_minute(df_prepared, timestamp_col, aggregation_method, label) + + # 步骤5: 生成输出文件路径 + if output_file is None: + input_path = Path(input_file) + output_file = f"dataset/data_preprocess/minute_sampled_{input_path.stem}.parquet" + + # 步骤6: 保存处理后的数据 + save_processed_data(df_subsampled, output_file) + + logger.info("="*50) + logger.info("数据下采样处理流程完成") + logger.info(f"输入文件: {input_file}") + logger.info(f"输出文件: {output_file}") + logger.info(f"原始数据形状: {df.shape}") + logger.info(f"处理后数据形状: {df_subsampled.shape}") + logger.info("="*50) + + return output_file + + +def main(): + """ + 主函数,执行数据下采样处理 + """ + try: + # 输入文件路径 + input_file = "dataset/origin_data/cleaning_utc/Contacting_cleaned.parquet" + + # 执行下采样处理 + output_file = process_subsample_minute( + input_file=input_file, + columns_to_remove=['ID', 'Station'], + aggregation_method='mean', + timestamp_col='TimeStamp', + label='center' + ) + + logger.info(f"处理完成!输出文件: {output_file}") + + except Exception as 
e: + logger.error(f"处理过程中发生错误: {e}") + raise + + +if __name__ == "__main__": + main() diff --git a/data_preprocess/supervise/label_anomaly_test_train.py b/data_preprocess/supervise/label_anomaly_test_train.py new file mode 100644 index 000000000..80fd0594d --- /dev/null +++ b/data_preprocess/supervise/label_anomaly_test_train.py @@ -0,0 +1,241 @@ +import pandas as pd +import pickle +import numpy as np +from utils.logger import logger +from tqdm import tqdm + + +def label_anomaly_data(train_path: str = "dataset/split_data/train.parquet", + test_path: str = "dataset/split_data/test.parquet", + anomaly_dict_path: str = "dataset/origin_data/anomaly_dict_merged.pkl", + station_key: str = "Kontaktieren", + output_train_path: str = "dataset/split_data/train_labeled.parquet", + output_test_path: str = "dataset/split_data/test_labeled.parquet"): + """ + 对时间序列数据进行异常标签操作 + + Args: + train_path: 训练数据路径 + test_path: 测试数据路径 + anomaly_dict_path: 异常字典pkl文件路径 + station_key: 站点名称键值 (默认: "Kontaktieren") + output_train_path: 输出训练数据路径 + output_test_path: 输出测试数据路径 + + Returns: + tuple: (labeled_train_df, labeled_test_df) 标记后的训练和测试数据 + """ + + logger.info("开始加载数据文件...") + + # 加载数据 + try: + train_df = pd.read_parquet(train_path) + test_df = pd.read_parquet(test_path) + logger.info(f"训练数据形状: {train_df.shape}") + logger.info(f"测试数据形状: {test_df.shape}") + + with open(anomaly_dict_path, 'rb') as f: + anomaly_dict = pickle.load(f) + logger.info(f"异常字典包含的站点: {list(anomaly_dict.keys())}") + + except Exception as e: + logger.error(f"数据加载失败: {e}") + raise + + # 检查站点键是否存在 + if station_key not in anomaly_dict: + available_keys = list(anomaly_dict.keys()) + logger.error(f"站点键 '{station_key}' 不在异常字典中。可用键: {available_keys}") + raise ValueError(f"站点键 '{station_key}' 不存在") + + # 获取异常时间段 + anomaly_periods = anomaly_dict[station_key] + logger.info(f"站点 '{station_key}' 共有 {len(anomaly_periods)} 个异常时间段") + + # 为训练数据添加标签 + logger.info("开始为训练数据添加标签...") + train_df_labeled = _add_anomaly_labels(train_df, anomaly_periods, "训练") + + # 为测试数据添加标签 + logger.info("开始为测试数据添加标签...") + test_df_labeled = _add_anomaly_labels(test_df, anomaly_periods, "测试") + + # 保存标记后的数据 + logger.info("保存标记后的数据...") + train_df_labeled.to_parquet(output_train_path, index=False) + test_df_labeled.to_parquet(output_test_path, index=False) + + # 输出统计信息 + _print_label_statistics(train_df_labeled, test_df_labeled) + + logger.info(f"标记完成!训练数据保存至: {output_train_path}") + logger.info(f"标记完成!测试数据保存至: {output_test_path}") + + return train_df_labeled, test_df_labeled + + +def _add_anomaly_labels(df: pd.DataFrame, anomaly_periods: list, dataset_name: str) -> pd.DataFrame: + """ + 为数据添加异常标签 + + Args: + df: 输入数据框 + anomaly_periods: 异常时间段列表,每个元素为(start_time, end_time)的元组 + dataset_name: 数据集名称,用于日志显示 + + Returns: + pd.DataFrame: 添加了label列的数据框 + """ + + # 复制数据框 + df_labeled = df.copy() + + # 初始化所有标签为0(正常) + df_labeled['label'] = 0 + + # 确保TimeStamp列是datetime类型且带时区信息 + if not pd.api.types.is_datetime64_any_dtype(df_labeled['TimeStamp']): + df_labeled['TimeStamp'] = pd.to_datetime(df_labeled['TimeStamp']) + + # 如果TimeStamp没有时区信息,假设为UTC + if df_labeled['TimeStamp'].dt.tz is None: + df_labeled['TimeStamp'] = df_labeled['TimeStamp'].dt.tz_localize('UTC') + + anomaly_count = 0 + total_anomaly_periods = len(anomaly_periods) + + logger.info(f"开始处理 {dataset_name} 数据的 {total_anomaly_periods} 个异常时间段...") + + # 遍历所有异常时间段 + for i, (start_time, end_time) in enumerate(tqdm(anomaly_periods, desc=f"处理{dataset_name}数据异常时间段")): + # 确保异常时间段的时间戳也有时区信息 + if hasattr(start_time, 'tz') and start_time.tz is 
None: + start_time = start_time.tz_localize('UTC') + if hasattr(end_time, 'tz') and end_time.tz is None: + end_time = end_time.tz_localize('UTC') + + # 找到在异常时间段内的数据点 + mask = (df_labeled['TimeStamp'] >= start_time) & (df_labeled['TimeStamp'] <= end_time) + anomaly_points = mask.sum() + + if anomaly_points > 0: + df_labeled.loc[mask, 'label'] = 1 + anomaly_count += anomaly_points + + # 每处理100个时间段输出一次进度 + if (i + 1) % 100 == 0: + logger.info(f"已处理 {i + 1}/{total_anomaly_periods} 个异常时间段") + + logger.info(f"{dataset_name} 数据标记完成,共标记 {anomaly_count} 个异常点") + + return df_labeled + + +def _print_label_statistics(train_df: pd.DataFrame, test_df: pd.DataFrame): + """ + 打印标签统计信息 + + Args: + train_df: 训练数据框 + test_df: 测试数据框 + """ + + logger.info("=" * 50) + logger.info("标签统计信息:") + logger.info("=" * 50) + + # 训练数据统计 + train_normal = (train_df['label'] == 0).sum() + train_anomaly = (train_df['label'] == 1).sum() + train_total = len(train_df) + train_anomaly_ratio = train_anomaly / train_total * 100 + + logger.info(f"训练数据:") + logger.info(f" 总样本数: {train_total:,}") + logger.info(f" 正常样本: {train_normal:,} ({100-train_anomaly_ratio:.2f}%)") + logger.info(f" 异常样本: {train_anomaly:,} ({train_anomaly_ratio:.2f}%)") + + # 测试数据统计 + test_normal = (test_df['label'] == 0).sum() + test_anomaly = (test_df['label'] == 1).sum() + test_total = len(test_df) + test_anomaly_ratio = test_anomaly / test_total * 100 + + logger.info(f"测试数据:") + logger.info(f" 总样本数: {test_total:,}") + logger.info(f" 正常样本: {test_normal:,} ({100-test_anomaly_ratio:.2f}%)") + logger.info(f" 异常样本: {test_anomaly:,} ({test_anomaly_ratio:.2f}%)") + + # 总体统计 + total_samples = train_total + test_total + total_anomalies = train_anomaly + test_anomaly + total_anomaly_ratio = total_anomalies / total_samples * 100 + + logger.info(f"总体统计:") + logger.info(f" 总样本数: {total_samples:,}") + logger.info(f" 异常样本: {total_anomalies:,} ({total_anomaly_ratio:.2f}%)") + logger.info("=" * 50) + + +def verify_labeled_data(train_path: str = "dataset/split_data/train_labeled.parquet", + test_path: str = "dataset/split_data/test_labeled.parquet"): + """ + 验证标记后的数据 + + Args: + train_path: 标记后的训练数据路径 + test_path: 标记后的测试数据路径 + """ + + logger.info("开始验证标记后的数据...") + + try: + train_df = pd.read_parquet(train_path) + test_df = pd.read_parquet(test_path) + + # 检查是否有label列 + if 'label' not in train_df.columns: + logger.error("训练数据中没有找到label列") + return False + if 'label' not in test_df.columns: + logger.error("测试数据中没有找到label列") + return False + + # 检查label值是否只包含0和1 + train_unique_labels = set(train_df['label'].unique()) + test_unique_labels = set(test_df['label'].unique()) + + expected_labels = {0, 1} + if not train_unique_labels.issubset(expected_labels): + logger.error(f"训练数据label值异常: {train_unique_labels}") + return False + if not test_unique_labels.issubset(expected_labels): + logger.error(f"测试数据label值异常: {test_unique_labels}") + return False + + # 打印验证统计 + _print_label_statistics(train_df, test_df) + + logger.info("数据验证通过!") + return True + + except Exception as e: + logger.error(f"数据验证失败: {e}") + return False + + +if __name__ == "__main__": + # 示例用法 + try: + # 执行标签操作 + train_labeled, test_labeled = label_anomaly_data() + + # 验证结果 + verify_labeled_data() + + logger.info("异常标签操作成功完成!") + + except Exception as e: + logger.error(f"执行失败: {e}") + raise \ No newline at end of file diff --git a/data_preprocess/unsupervise/anomaly_filtered_Traindata.py b/data_preprocess/unsupervise/anomaly_filtered_Traindata.py new file mode 100644 index 000000000..685b8cf3d --- /dev/null +++ 
b/data_preprocess/unsupervise/anomaly_filtered_Traindata.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import pandas as pd +import pickle +import os +from datetime import datetime +import numpy as np + +def filter_anomalies(train_path, anomaly_dict_path, station_name="Kontaktieren", output_dir="dataset/anomaly_filtered_data"): + """ + Filter out anomaly data points from the training dataset based on anomaly time ranges. + + Args: + train_path (str): Path to the training Parquet file + anomaly_dict_path (str): Path to the anomaly dictionary pickle file + station_name (str): Name of the station to filter anomalies for + output_dir (str): Directory to save the filtered dataset + """ + print(f"Loading training data from: {train_path}") + train_df = pd.read_parquet(train_path) + print(f"Training data loaded. Total rows: {len(train_df)}") + + print(f"Loading anomaly dictionary from: {anomaly_dict_path}") + with open(anomaly_dict_path, 'rb') as f: + anomaly_dict = pickle.load(f) + + # Print structure of the first anomaly to debug + print(f"Structure of anomaly_dict: {type(anomaly_dict)}") + if station_name in anomaly_dict and len(anomaly_dict[station_name]) > 0: + print(f"First anomaly structure: {type(anomaly_dict[station_name][0])}") + print(f"Example anomaly: {anomaly_dict[station_name][0]}") + + # Check if the station exists in the anomaly dictionary + if station_name not in anomaly_dict: + print(f"Station '{station_name}' not found in anomaly dictionary. Available stations: {list(anomaly_dict.keys())}") + return + + station_anomalies = anomaly_dict[station_name] + print(f"Found {len(station_anomalies)} anomaly time ranges for station '{station_name}'") + + # Create a mask for normal data points (not anomalies) + print("Identifying anomaly data points...") + normal_mask = np.ones(len(train_df), dtype=bool) + + # Convert timestamp column to datetime if needed + if not pd.api.types.is_datetime64_any_dtype(train_df['TimeStamp']): + train_df['TimeStamp'] = pd.to_datetime(train_df['TimeStamp']) + + # Check each anomaly time range + for anomaly_period in station_anomalies: + # Handle tuple structure (assuming first element is start time, second is end time) + if isinstance(anomaly_period, tuple) and len(anomaly_period) >= 2: + start_time = pd.to_datetime(anomaly_period[0]) + end_time = pd.to_datetime(anomaly_period[1]) + else: + print(f"Unexpected anomaly format: {anomaly_period}, skipping...") + continue + + # Find rows that fall within this anomaly period + anomaly_mask = (train_df['TimeStamp'] >= start_time) & (train_df['TimeStamp'] <= end_time) + # Update the normal mask + normal_mask = normal_mask & ~anomaly_mask + + # Filter out anomaly data points + filtered_df = train_df[normal_mask] + + print(f"Removed {len(train_df) - len(filtered_df)} anomaly data points.") + print(f"Filtered dataset has {len(filtered_df)} normal data points.") + + # Create output directory if it doesn't exist + os.makedirs(output_dir, exist_ok=True) + + # Save the filtered dataset + output_path = os.path.join(output_dir, "train_normal.parquet") + print(f"Saving filtered dataset to: {output_path}") + filtered_df.to_parquet(output_path, index=False) + + print("Filtering complete!") + + return filtered_df + +if __name__ == "__main__": + train_path = "dataset/split_data/train.parquet" + anomaly_dict_path = "dataset/origin_data/anomaly_dict_merged.pkl" + + filter_anomalies(train_path, anomaly_dict_path) \ No newline at end of file diff --git a/data_preprocess/unsupervise/anomaly_labeld_testData.py 
b/data_preprocess/unsupervise/anomaly_labeld_testData.py new file mode 100644 index 000000000..9fc2efb37 --- /dev/null +++ b/data_preprocess/unsupervise/anomaly_labeld_testData.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import pandas as pd +import pickle +import os +import numpy as np + +def label_anomalies(test_path, anomaly_dict_path, station_name="Kontaktieren", output_dir="dataset/anomaly_labeld_testData"): + """ + Label test data points as anomalies (1) or normal (0) based on anomaly time ranges. + + Args: + test_path (str): Path to the test Parquet file + anomaly_dict_path (str): Path to the anomaly dictionary pickle file + station_name (str): Name of the station to get anomalies for + output_dir (str): Directory to save the labeled dataset + """ + print(f"Loading test data from: {test_path}") + test_df = pd.read_parquet(test_path) + print(f"Test data loaded. Total rows: {len(test_df)}") + + print(f"Loading anomaly dictionary from: {anomaly_dict_path}") + with open(anomaly_dict_path, 'rb') as f: + anomaly_dict = pickle.load(f) + + # Check if the station exists in the anomaly dictionary + if station_name not in anomaly_dict: + print(f"Station '{station_name}' not found in anomaly dictionary. Available stations: {list(anomaly_dict.keys())}") + return + + station_anomalies = anomaly_dict[station_name] + print(f"Found {len(station_anomalies)} anomaly time ranges for station '{station_name}'") + + # Convert timestamp column to datetime if needed + if not pd.api.types.is_datetime64_any_dtype(test_df['TimeStamp']): + test_df['TimeStamp'] = pd.to_datetime(test_df['TimeStamp']) + + # Initialize label column with 0 (normal) + test_df['label'] = 0 + + # Create a copy of the DataFrame with only TimeStamp and label columns + labeled_df = test_df[['TimeStamp', 'label']].copy() + + print("Labeling anomalies in test data...") + + # Count anomalies for reporting + anomaly_count = 0 + + # Check each anomaly time range + for anomaly_period in station_anomalies: + # Handle tuple structure (start time, end time) + if isinstance(anomaly_period, tuple) and len(anomaly_period) >= 2: + start_time = pd.to_datetime(anomaly_period[0]) + end_time = pd.to_datetime(anomaly_period[1]) + else: + print(f"Unexpected anomaly format: {anomaly_period}, skipping...") + continue + + # Find rows that fall within this anomaly period and label them as 1 (anomaly) + anomaly_mask = (labeled_df['TimeStamp'] >= start_time) & (labeled_df['TimeStamp'] <= end_time) + labeled_df.loc[anomaly_mask, 'label'] = 1 + + # Count newly identified anomalies + anomaly_count += anomaly_mask.sum() + + print(f"Labeling complete. 
Found {anomaly_count} anomaly data points out of {len(labeled_df)} total.") + + # Create output directory if it doesn't exist + os.makedirs(output_dir, exist_ok=True) + + # Save the labeled dataset + output_path = os.path.join(output_dir, "test_labeled.parquet") + print(f"Saving labeled dataset to: {output_path}") + labeled_df.to_parquet(output_path, index=False) + + # Optional: Save a CSV version for easier inspection + csv_output_path = os.path.join(output_dir, "test_labeled.csv") + labeled_df.to_csv(csv_output_path, index=False) + print(f"Also saved as CSV for easier inspection: {csv_output_path}") + + # Print summary statistics + print("\nSummary:") + print(f"Total data points: {len(labeled_df)}") + print(f"Normal data points (label=0): {(labeled_df['label'] == 0).sum()}") + print(f"Anomaly data points (label=1): {(labeled_df['label'] == 1).sum()}") + print(f"Anomaly percentage: {(labeled_df['label'] == 1).sum() / len(labeled_df) * 100:.2f}%") + + print("Labeling process complete!") + + return labeled_df + +if __name__ == "__main__": + test_path = "dataset/split_data/test.parquet" + anomaly_dict_path = "dataset/origin_data/anomaly_dict_merged.pkl" + + label_anomalies(test_path, anomaly_dict_path) \ No newline at end of file diff --git a/data_preprocess/unsupervise/feature_correlation_analysis.py b/data_preprocess/unsupervise/feature_correlation_analysis.py new file mode 100644 index 000000000..fc462c674 --- /dev/null +++ b/data_preprocess/unsupervise/feature_correlation_analysis.py @@ -0,0 +1,321 @@ +#!/usr/bin/env python3 +""" +特征相关性分析和冗余特征删除 +""" + +import pandas as pd +import numpy as np +import os +from typing import List, Tuple, Set +from utils.logger import setup_logger +import warnings + +warnings.filterwarnings('ignore') + +def load_training_data(file_path: str) -> pd.DataFrame: + """ + 加载训练数据 + + Args: + file_path: 训练数据文件路径 + + Returns: + pd.DataFrame: 加载的训练数据 + """ + logger = setup_logger("load_training_data") + + try: + logger.info(f"Loading training data from: {file_path}") + df = pd.read_parquet(file_path) + logger.info(f"Successfully loaded training data. 
Shape: {df.shape}") + logger.info(f"Columns: {list(df.columns)}") + logger.info(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB") + + return df + + except Exception as e: + logger.error(f"Error loading training data: {str(e)}") + raise + + +def calculate_correlation_matrix(df: pd.DataFrame, exclude_cols: List[str] = None) -> pd.DataFrame: + """ + 计算特征相关性矩阵 + + Args: + df: 输入数据框 + exclude_cols: 需要排除的列名列表(如时间戳、标签等) + + Returns: + pd.DataFrame: 相关性矩阵 + """ + logger = setup_logger("calculate_correlation_matrix") + + if exclude_cols is None: + exclude_cols = [] + + try: + # 选择数值型列进行相关性分析 + numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() + + # 排除指定的列 + analysis_cols = [col for col in numeric_cols if col not in exclude_cols] + + logger.info(f"Calculating correlation matrix for {len(analysis_cols)} features") + logger.info(f"Excluded columns: {exclude_cols}") + + # 计算相关性矩阵 + correlation_matrix = df[analysis_cols].corr() + + logger.info(f"Correlation matrix shape: {correlation_matrix.shape}") + + # 保存相关性矩阵 + output_dir = "experiments/intermediate_results" + os.makedirs(output_dir, exist_ok=True) + correlation_file = os.path.join(output_dir, "correlation_matrix.parquet") + correlation_matrix.to_parquet(correlation_file) + logger.info(f"Correlation matrix saved to: {correlation_file}") + + return correlation_matrix + + except Exception as e: + logger.error(f"Error calculating correlation matrix: {str(e)}") + raise + + +def find_highly_correlated_features(correlation_matrix: pd.DataFrame, threshold: float = 0.95) -> List[Tuple[str, str, float]]: + """ + 找出高度相关的特征对 + + Args: + correlation_matrix: 相关性矩阵 + threshold: 相关性阈值 + + Returns: + List[Tuple[str, str, float]]: 高度相关的特征对列表 (feature1, feature2, correlation) + """ + logger = setup_logger("find_highly_correlated_features") + + try: + logger.info(f"Finding highly correlated features with threshold: {threshold}") + + highly_correlated_pairs = [] + + # 遍历相关性矩阵的上三角部分 + for i in range(len(correlation_matrix.columns)): + for j in range(i+1, len(correlation_matrix.columns)): + feature1 = correlation_matrix.columns[i] + feature2 = correlation_matrix.columns[j] + correlation = correlation_matrix.iloc[i, j] + + # 检查是否超过阈值 + if abs(correlation) > threshold and not pd.isna(correlation): + highly_correlated_pairs.append((feature1, feature2, correlation)) + + logger.info(f"Found {len(highly_correlated_pairs)} highly correlated feature pairs") + + # 按相关性大小排序 + highly_correlated_pairs.sort(key=lambda x: abs(x[2]), reverse=True) + + # 记录前10个最相关的特征对 + logger.info("Top 10 highly correlated feature pairs:") + for i, (feat1, feat2, corr) in enumerate(highly_correlated_pairs[:10]): + logger.info(f" {i+1}. 
{feat1} <-> {feat2}: {corr:.4f}") + + return highly_correlated_pairs + + except Exception as e: + logger.error(f"Error finding highly correlated features: {str(e)}") + raise + + +def select_features_to_remove(highly_correlated_pairs: List[Tuple[str, str, float]], + df: pd.DataFrame) -> Set[str]: + """ + 选择需要删除的冗余特征 + + Args: + highly_correlated_pairs: 高度相关的特征对列表 + df: 原始数据框(用于计算特征的方差等统计信息) + + Returns: + Set[str]: 需要删除的特征名集合 + """ + logger = setup_logger("select_features_to_remove") + + try: + logger.info("Selecting redundant features to remove") + + features_to_remove = set() + + # 计算每个特征的方差(用于决定保留哪个特征) + feature_variance = df.select_dtypes(include=[np.number]).var() + + for feature1, feature2, correlation in highly_correlated_pairs: + # 如果两个特征都还没有被标记为删除 + if feature1 not in features_to_remove and feature2 not in features_to_remove: + # 保留方差更大的特征(信息量更丰富) + if feature_variance[feature1] >= feature_variance[feature2]: + features_to_remove.add(feature2) + logger.info(f"Marked {feature2} for removal (corr with {feature1}: {correlation:.4f})") + else: + features_to_remove.add(feature1) + logger.info(f"Marked {feature1} for removal (corr with {feature2}: {correlation:.4f})") + + logger.info(f"Total features to remove: {len(features_to_remove)}") + logger.info(f"Features to remove: {sorted(list(features_to_remove))}") + + # 保存要删除的特征列表 + output_dir = "experiments/intermediate_results" + os.makedirs(output_dir, exist_ok=True) + + # 保存为文本文件便于查看 + features_file = os.path.join(output_dir, "features_to_remove.txt") + with open(features_file, 'w') as f: + for feature in sorted(features_to_remove): + f.write(f"{feature}\n") + logger.info(f"Features to remove saved to: {features_file}") + + return features_to_remove + + except Exception as e: + logger.error(f"Error selecting features to remove: {str(e)}") + raise + + +def remove_redundant_features(df: pd.DataFrame, features_to_remove: Set[str]) -> pd.DataFrame: + """ + 从数据框中删除冗余特征 + + Args: + df: 原始数据框 + features_to_remove: 需要删除的特征名集合 + + Returns: + pd.DataFrame: 删除冗余特征后的数据框 + """ + logger = setup_logger("remove_redundant_features") + + try: + original_shape = df.shape + logger.info(f"Original data shape: {original_shape}") + + # 删除冗余特征 + df_filtered = df.drop(columns=list(features_to_remove), errors='ignore') + + new_shape = df_filtered.shape + logger.info(f"Filtered data shape: {new_shape}") + logger.info(f"Removed {original_shape[1] - new_shape[1]} features") + + return df_filtered + + except Exception as e: + logger.error(f"Error removing redundant features: {str(e)}") + raise + + +def save_filtered_data(df: pd.DataFrame, output_path: str): + """ + 保存过滤后的数据 + + Args: + df: 过滤后的数据框 + output_path: 输出文件路径 + """ + logger = setup_logger("save_filtered_data") + + try: + # 确保输出目录存在 + os.makedirs(os.path.dirname(output_path), exist_ok=True) + + logger.info(f"Saving filtered data to: {output_path}") + df.to_parquet(output_path, index=False) + + file_size = os.path.getsize(output_path) / 1024**2 + logger.info(f"Successfully saved filtered data. 
File size: {file_size:.2f} MB") + + except Exception as e: + logger.error(f"Error saving filtered data: {str(e)}") + raise + + +def process_feature_correlation_analysis(train_data_path: str = "dataset/anomaly_filtered_Traindata/train_normal.parquet", + test_data_path: str = "dataset/split_data/test.parquet", + correlation_threshold: float = 0.95, + exclude_cols: List[str] = None): + """ + 主函数:执行完整的特征相关性分析和冗余特征删除流程 + + Args: + train_data_path: 训练数据路径 + test_data_path: 测试数据路径 + correlation_threshold: 相关性阈值 + exclude_cols: 需要排除的列名列表 + """ + main_logger = setup_logger("feature_correlation_analysis") + + try: + main_logger.info("=== Starting Feature Correlation Analysis ===") + main_logger.info(f"Train data path: {train_data_path}") + main_logger.info(f"Test data path: {test_data_path}") + main_logger.info(f"Correlation threshold: {correlation_threshold}") + + if exclude_cols is None: + exclude_cols = ['timestamp', 'date', 'label', 'anomaly'] # 常见的非特征列 + + # 步骤1: 加载训练数据 + main_logger.info("Step 1: Loading training data") + train_df = load_training_data(train_data_path) + + # 步骤2: 计算相关性矩阵 + main_logger.info("Step 2: Calculating correlation matrix") + correlation_matrix = calculate_correlation_matrix(train_df, exclude_cols) + + # 步骤3: 找出高度相关的特征对 + main_logger.info("Step 3: Finding highly correlated features") + highly_correlated_pairs = find_highly_correlated_features(correlation_matrix, correlation_threshold) + + # 步骤4: 选择需要删除的特征 + main_logger.info("Step 4: Selecting features to remove") + features_to_remove = select_features_to_remove(highly_correlated_pairs, train_df) + + if not features_to_remove: + main_logger.info("No highly correlated features found. No features to remove.") + return + + # 步骤5: 从训练数据中删除冗余特征 + main_logger.info("Step 5: Removing redundant features from training data") + train_df_filtered = remove_redundant_features(train_df, features_to_remove) + + # 保存过滤后的训练数据 + output_dir = "dataset/feature_filtered_data" + train_output_path = os.path.join(output_dir, "train_normal_filtered.parquet") + save_filtered_data(train_df_filtered, train_output_path) + + # 步骤6: 处理测试数据 + main_logger.info("Step 6: Processing test data") + if os.path.exists(test_data_path): + main_logger.info(f"Loading test data from: {test_data_path}") + test_df = pd.read_parquet(test_data_path) + main_logger.info(f"Test data shape: {test_df.shape}") + + # 从测试数据中删除相同的特征 + test_df_filtered = remove_redundant_features(test_df, features_to_remove) + + # 保存过滤后的测试数据 + test_output_path = os.path.join(output_dir, "test_filtered.parquet") + save_filtered_data(test_df_filtered, test_output_path) + else: + main_logger.warning(f"Test data file not found: {test_data_path}") + + main_logger.info("=== Feature Correlation Analysis Completed Successfully ===") + main_logger.info(f"Filtered data saved to: {output_dir}") + + except Exception as e: + main_logger.error(f"Error in feature correlation analysis: {str(e)}") + raise + + +if __name__ == "__main__": + # 执行特征相关性分析 + process_feature_correlation_analysis() \ No newline at end of file diff --git "a/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/contact_final_analysis.md" "b/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/contact_final_analysis.md" new file mode 100644 index 000000000..ff48c73e0 --- /dev/null +++ "b/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/contact_final_analysis.md" @@ -0,0 +1,40 @@ +venv) wanting@jerry:~/Time-Series-Library$ pym 
contact_final_analysis.py +▶️ Running: python3 -m contact_final_analysis +🔍 Contact数据集 - 最终分析报告 +============================================================ +✅ 数据加载成功 + +📊 数据基本信息: + 训练数据: (365254, 28) + 测试数据: (134700, 28) + 测试标签: (134700, 2) + 有效特征数: 27 + 训练数据NaN: 0 + 测试数据NaN: 0 + +🎯 异常分布: + 正常样本: 127749 (94.8%) + 异常样本: 6951 (5.2%) + +🔍 特征区分力分析: + Top 5 最有区分力的特征: + 1. aVoltage_L3_N: 0.8012 + 2. aVoltage_L1_N: 0.7776 + 3. aVoltage_L2_N: 0.7712 + 4. aCosPhi_L3: 0.5839 + 5. aCosPhi_L1: 0.5766 + 平均区分力: 0.4186 + +🔗 相关性分析: + 高相关性特征对 (>0.9): 41 + +🧮 PCA分析: + 前5个主成分解释方差: [0.51600832 0.14777461 0.11308165 0.07439796 0.04619874] + 累计解释方差: 0.985 + +⏰ 时序模式分析: + 分析特征: aVoltage_L3_N + 时序模式差异: 0.2278 + +🎯 最终建议: + 异常检测难度: 相对容易 diff --git "a/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/contact_final_analysis.py" "b/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/contact_final_analysis.py" new file mode 100644 index 000000000..001614eb9 --- /dev/null +++ "b/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/contact_final_analysis.py" @@ -0,0 +1,179 @@ +#!/usr/bin/env python +""" +Contact数据集最终分析脚本 +整合了之前版本的优点,提供可靠的分析结果 +""" + +import pandas as pd +import numpy as np +from sklearn.preprocessing import StandardScaler +from sklearn.decomposition import PCA +from sklearn.impute import SimpleImputer +import warnings +warnings.filterwarnings('ignore') + +def analyze_contact_data(): + """ + Contact数据集的确定性分析 + """ + print("🔍 Contact数据集 - 最终分析报告") + print("="*60) + + # 1. 加载数据 + try: + train_df = pd.read_parquet("./dataset/anomaly_filtered_Traindata/train_normal.parquet") + test_df = pd.read_parquet("./dataset/split_data/test.parquet") + test_labels = pd.read_parquet("./dataset/anomaly_labeld_testData/test_labeled.parquet") + print("✅ 数据加载成功") + except Exception as e: + print(f"❌ 数据加载失败: {e}") + return + + # 2. 数据基本信息 + print(f"\n📊 数据基本信息:") + print(f" 训练数据: {train_df.shape}") + print(f" 测试数据: {test_df.shape}") + print(f" 测试标签: {test_labels.shape}") + + # 3. 获取有效特征列 + exclude_cols = ['TimeStamp', 'timestamp', 'index', 'Unnamed: 0'] + numeric_cols = [col for col in train_df.select_dtypes(include=[np.number]).columns + if col not in exclude_cols] + print(f" 有效特征数: {len(numeric_cols)}") + + # 4. 数据质量检查 + train_nans = train_df[numeric_cols].isnull().sum().sum() + test_nans = test_df[numeric_cols].isnull().sum().sum() + print(f" 训练数据NaN: {train_nans}") + print(f" 测试数据NaN: {test_nans}") + + # 6. 异常分布分析 + normal_mask = test_labels['label'] == 0 + anomaly_mask = test_labels['label'] == 1 + + print(f"\n🎯 异常分布:") + print(f" 正常样本: {normal_mask.sum()} ({normal_mask.sum()/len(test_labels)*100:.1f}%)") + print(f" 异常样本: {anomaly_mask.sum()} ({anomaly_mask.sum()/len(test_labels)*100:.1f}%)") + + # 7. 特征区分力分析 + print(f"\n🔍 特征区分力分析:") + feature_scores = [] + + for col in numeric_cols: + normal_data = test_df[normal_mask][col] + anomaly_data = test_df[anomaly_mask][col] + + # 标准化均值差异 + mean_diff = abs(normal_data.mean() - anomaly_data.mean()) + std_pooled = np.sqrt((normal_data.std()**2 + anomaly_data.std()**2) / 2) + 1e-8 + + discrimination_score = mean_diff / std_pooled + feature_scores.append((col, discrimination_score)) + + # 排序特征 + feature_scores.sort(key=lambda x: x[1], reverse=True) + + print(f" Top 5 最有区分力的特征:") + for i, (feature, score) in enumerate(feature_scores[:5]): + print(f" {i+1}. 
{feature}: {score:.4f}") + + avg_score = np.mean([score for _, score in feature_scores]) + print(f" 平均区分力: {avg_score:.4f}") + + # 8. 相关性分析 + print(f"\n🔗 相关性分析:") + corr_matrix = train_df[numeric_cols].corr() + high_corr_count = 0 + + for i in range(len(corr_matrix.columns)): + for j in range(i+1, len(corr_matrix.columns)): + if abs(corr_matrix.iloc[i, j]) > 0.9: + high_corr_count += 1 + + print(f" 高相关性特征对 (>0.9): {high_corr_count}") + + # 9. PCA分析 + print(f"\n🧮 PCA分析:") + scaler = StandardScaler() + train_scaled = scaler.fit_transform(train_df[numeric_cols]) + test_scaled = scaler.transform(test_df[numeric_cols]) + + n_components = min(10, len(numeric_cols)) + pca = PCA(n_components=n_components) + train_pca = pca.fit_transform(train_scaled) + test_pca = pca.transform(test_scaled) + + print(f" 前5个主成分解释方差: {pca.explained_variance_ratio_[:5]}") + print(f" 累计解释方差: {pca.explained_variance_ratio_.sum():.3f}") + + # 10. 时序模式分析(使用最佳特征) + print(f"\n⏰ 时序模式分析:") + best_feature = feature_scores[0][0] # 最有区分力的特征 + print(f" 分析特征: {best_feature}") + + # 采样数据进行自相关分析 + sample_size = min(1000, normal_mask.sum()) + normal_sample = test_df[normal_mask][best_feature].iloc[:sample_size] + anomaly_sample = test_df[anomaly_mask][best_feature].iloc[:min(1000, anomaly_mask.sum())] + + def autocorr(x, max_lag=50): + """计算自相关""" + x = x - x.mean() # 去中心化 + result = np.correlate(x, x, mode='full') + result = result[len(result)//2:] + if result[0] != 0: + result = result / result[0] + return result[:max_lag] + + normal_autocorr = autocorr(normal_sample.values) + anomaly_autocorr = autocorr(anomaly_sample.values) + + autocorr_diff = np.mean(np.abs(normal_autocorr - anomaly_autocorr)) + print(f" 时序模式差异: {autocorr_diff:.4f}") + + # 11. 最终建议 + print(f"\n🎯 最终建议:") + + if avg_score < 0.1: + difficulty = "困难" + seq_len_rec = "300-500" + epochs_rec = "25-30" + model_rec = "深度模型 (e_layers>=6)" + elif avg_score < 0.2: + difficulty = "中等" + seq_len_rec = "200-300" + epochs_rec = "20-25" + model_rec = "标准模型 (e_layers=4-6)" + else: + difficulty = "相对容易" + seq_len_rec = "100-200" + epochs_rec = "15-20" + model_rec = "标准模型即可" + + print(f" 异常检测难度: {difficulty}") + print(f" 推荐序列长度: {seq_len_rec}") + print(f" 推荐训练轮数: {epochs_rec}") + print(f" 推荐模型配置: {model_rec}") + + if autocorr_diff < 0.1: + print(f" ⚠️ 时序模式差异较小,建议增强特征工程") + else: + print(f" ✅ 时序模式有差异,模型应该能够学习") + + if high_corr_count > 20: + print(f" ⚠️ 特征冗余较多,建议特征选择") + + print(f"\n✅ 分析完成!基于这个分析结果进行模型训练。") + + # 返回关键指标 + return { + 'avg_discrimination': avg_score, + 'temporal_difference': autocorr_diff, + 'high_correlation_pairs': high_corr_count, + 'best_feature': best_feature, + 'anomaly_ratio': anomaly_mask.sum() / len(test_labels), + 'difficulty': difficulty + } + +if __name__ == "__main__": + results = analyze_contact_data() \ No newline at end of file diff --git "a/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/diagnose_reconstruction_anomaly_detection.md" "b/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/diagnose_reconstruction_anomaly_detection.md" new file mode 100644 index 000000000..328b39297 --- /dev/null +++ "b/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/diagnose_reconstruction_anomaly_detection.md" @@ -0,0 +1,174 @@ +[2025-06-25 22:39:03,954] [INFO] [default] - Logger initialized. Log file: experiments/logs/default_20250625_223903.log +[2025-06-25 22:39:03,954] [INFO] [default] - 🚀 开始重构异常检测失败原因分析... 
+[2025-06-25 22:39:03,954] [INFO] [default] - 📂 加载Contact数据... +[2025-06-25 22:39:04,830] [INFO] [default] - 训练数据形状: (365254, 27) +[2025-06-25 22:39:04,830] [INFO] [default] - 测试数据形状: (134700, 27) +[2025-06-25 22:39:04,831] [INFO] [default] - 异常比例: 5.16% +[2025-06-25 22:39:04,834] [INFO] [default] - 🔍 分析数据特性... +[2025-06-25 22:39:04,838] [INFO] [default] - === 数据复杂性分析 === +[2025-06-25 22:39:04,866] [INFO] [default] - 高相关性特征对数量 (>0.9): 41 +| 分析项 | 发现 | +| ------ | ------------------------------- | +| 高相关特征对 | 41对,说明存在强共线性,可导致模型“靠一个变量猜出其他变量” | +| 低方差特征 | 0,说明无冗余静态特征 | +| 高偏度特征 | 3 个,可能影响建模稳定性 | + +[2025-06-25 22:39:04,887] [INFO] [default] - 低方差特征数量 (<0.01): 0 +[2025-06-25 22:39:04,920] [INFO] [default] - 高偏度特征数量 (|skew|>2): 3 +[2025-06-25 22:39:04,920] [INFO] [default] - +=== 可重构性分析 === +[2025-06-25 22:39:05,566] [INFO] [default] - 正常样本平均复杂度: 9.197748 +[2025-06-25 22:39:05,566] [INFO] [default] - 异常样本平均复杂度: 6.389511 +解释:你希望异常点“更难重构”,但它们恰恰“更容易”被模型复现。 + +系统警告:可能是模型过度泛化,或异常点结构过于规整(low-complexity)。 +[2025-06-25 22:39:05,566] [INFO] [default] - 复杂度比例: 0.695 +[2025-06-25 22:39:05,597] [INFO] [default] - 正常样本与训练集平均相似性: -0.0205 +[2025-06-25 22:39:05,597] [INFO] [default] - 异常样本与训练集平均相似性: -0.1142 +[2025-06-25 22:39:05,597] [INFO] [default] - 🧪 测试简单重构基线... +[2025-06-25 22:39:05,597] [INFO] [default] - 测试线性自编码器... +[2025-06-25 22:39:07,687] [INFO] [default] - P90阈值: F1=0.0145, P=0.0110, R=0.0213 +[2025-06-25 22:39:07,692] [INFO] [default] - P95阈值: F1=0.0025, P=0.0025, R=0.0024 +[2025-06-25 22:39:07,697] [INFO] [default] - P99阈值: F1=0.0019, P=0.0059, R=0.0012 +[2025-06-25 22:39:07,701] [INFO] [default] - P99.5阈值: F1=0.0021, P=0.0119, R=0.0012 +[2025-06-25 22:39:07,702] [INFO] [default] - 测试PCA重构... +[2025-06-25 22:39:07,818] [INFO] [default] - PCA-10: F1=0.0158, 解释方差=0.985 +[2025-06-25 22:39:07,865] [INFO] [default] - PCA-20: F1=0.0113, 解释方差=1.000 +[2025-06-25 22:39:07,906] [INFO] [default] - PCA-13: F1=0.0022, 解释方差=0.996 +[2025-06-25 22:39:07,906] [INFO] [default] - 测试马哈拉诺比斯距离... +[2025-06-25 22:39:07,957] [INFO] [default] - 马哈拉诺比斯距离: F1=0.0089 +[2025-06-25 22:39:07,957] [INFO] [default] - +AE、PCA、Mahalanobis 全部失败(F1 < 0.02); + +说明:异常点和正常点在低维线性空间中无法有效分离。 + + +=== 重构失败原因分析 === +[2025-06-25 22:39:07,959] [INFO] [default] - 正常样本重构误差: 均值=9.464100, 标准差=8.400022 +[2025-06-25 22:39:07,959] [INFO] [default] - 异常样本重构误差: 均值=6.576734, 标准差=7.426354 +[2025-06-25 22:39:07,960] [INFO] [default] - 误差分离度: -0.344 +✅ 结论:异常样本是“局部规整 + 分布边缘”,对线性模型并不构成挑战,因此被轻易重构。 + + +[2025-06-25 22:39:07,960] [WARNING] [default] - ⚠️ 异常样本的重构误差与正常样本相近,模型可能过度泛化 +[2025-06-25 22:39:07,964] [INFO] [default] - 🔍 分析TimesNet特定问题... 
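For reference, the "误差分离度" (error separation) figure quoted above is the standardized gap between the mean reconstruction errors of the two groups; a negative value means anomalies are reconstructed *better* than normal points, so no error threshold can separate them. A minimal sketch of the computation, mirroring what the accompanying diagnostic script logs (with a small epsilon guard added):

```python
import numpy as np

def error_separation(errors: np.ndarray, labels: np.ndarray) -> float:
    """Standardized gap between anomaly and normal reconstruction errors.

    A value <= 0 means anomalies reconstruct at least as well as normal
    samples, so a plain reconstruction-error threshold cannot separate them.
    """
    normal_err = errors[labels == 0]
    anomaly_err = errors[labels == 1]
    return (anomaly_err.mean() - normal_err.mean()) / (normal_err.std() + 1e-8)
```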
+[2025-06-25 22:39:07,964] [INFO] [default] - === 时序长度分析 === +[2025-06-25 22:39:07,965] [INFO] [default] - seq_len=50: 窗口平均方差=1.561226 +[2025-06-25 22:39:07,967] [INFO] [default] - seq_len=100: 窗口平均方差=1.501821 +[2025-06-25 22:39:07,968] [INFO] [default] - seq_len=200: 窗口平均方差=1.366831 +[2025-06-25 22:39:07,968] [INFO] [default] - + +窗口越长,时序越平滑 → 异常点的冲击被“平均掉”。 + +若异常是短时脉冲,长窗口不利于检测。 + +✅ 建议: + +使用 短窗口(如 30~50)+ 滑窗推进; + +或改为 多尺度窗口建模(结合 long + short) + +=== 周期性分析 === +[2025-06-25 22:39:07,975] [INFO] [default] - 特征0的主要周期: [np.float64(480.0), np.float64(720.0), np.float64(1440.0)] +[2025-06-25 22:39:07,975] [INFO] [default] - 特征1的主要周期: [np.float64(480.0), np.float64(720.0), np.float64(1440.0)] +[2025-06-25 22:39:07,975] [INFO] [default] - 特征2的主要周期: [np.float64(480.0), np.float64(720.0), np.float64(1440.0)] + +TimesNet适用于此类多周期数据,周期提取有效; + +问题可能不是周期建模失败,而是异常本身对周期扰动小。 + + +[2025-06-25 22:39:07,975] [INFO] [default] - +=== 特征重要性分析 === +[2025-06-25 22:39:08,010] [INFO] [default] - Top 5 重要特征: +[2025-06-25 22:39:08,011] [INFO] [default] - 特征10: 方差=1.0000, 区分力=0.8012 +[2025-06-25 22:39:08,011] [INFO] [default] - 特征8: 方差=1.0000, 区分力=0.7777 +[2025-06-25 22:39:08,011] [INFO] [default] - 特征9: 方差=1.0000, 区分力=0.7712 +[2025-06-25 22:39:08,011] [INFO] [default] - 特征22: 方差=1.0000, 区分力=0.5839 +[2025-06-25 22:39:08,011] [INFO] [default] - 特征20: 方差=1.0000, 区分力=0.5766 +[2025-06-25 22:39:08,011] [INFO] [default] - + +✅ 存在少量特征对异常具有强感知能力; + +❗但若模型训练只依赖这几维(e.g., 通过通道注意力),将忽略全局时序结构 → 出现 overfitting shortcut。 + +=== 数据质量问题 === +[2025-06-25 22:39:08,104] [INFO] [default] - 平均异常值比例: 2.92% +[2025-06-25 22:39:08,108] [INFO] [default] - 可能非平稳的特征数: 2/10 +[2025-06-25 22:39:08,108] [INFO] [default] - 💡 提出解决方案... +[2025-06-25 22:39:08,108] [INFO] [default] - 建议的解决方案: +[2025-06-25 22:39:08,108] [INFO] [default] - 1. 异常样本复杂度不足,考虑重新定义异常标准 +[2025-06-25 22:39:08,108] [INFO] [default] - 2. 使用动态阈值或多阈值融合策略 +[2025-06-25 22:39:08,108] [INFO] [default] - 3. 考虑使用训练数据的重构误差分布来设定阈值 +[2025-06-25 22:39:08,108] [INFO] [default] - 📋 生成综合分析报告... +[2025-06-25 22:39:08,108] [INFO] [default] - 📄 分析报告已保存: reconstruction_failure_analysis.md +[2025-06-25 22:39:08,108] [INFO] [default] - ✅ 分析完成! + +# TimesNet重构异常检测失败原因分析报告 + +## 🎯 核心发现 + +### 数据特性问题 +- 高相关性特征对: 41 +- 低方差特征数: 0 +- 异常样本复杂度比例: 0.695 +- 异常样本相似性: -0.114 + +### 基线方法性能 +- linear_ae_p90: F1=0.0145 +- linear_ae_p95: F1=0.0025 +- linear_ae_p99: F1=0.0019 +- linear_ae_p99.5: F1=0.0021 +- pca_10: F1=0.0158 +- pca_20: F1=0.0113 +- pca_13: F1=0.0022 +- mahalanobis: F1=0.0089 + +### TimesNet特定问题 +- 平均异常值比例: 2.92% +- 非平稳特征数: 2/10 +- 数据质量问题: 无 + +## 🔍 失败原因分析 + +### 1. 数据本身的问题 +- **高维度低区分度**: 27个特征中可能存在大量冗余信息 +- **异常样本不够"异常"**: 异常样本与正常样本在特征空间中距离不够远 +- **训练数据过度平滑**: 过滤异常后的训练数据可能过于"完美" + +### 2. 重构任务的固有困难 +- **过度泛化**: 模型学会重构所有样本,包括异常样本 +- **表示能力过强**: TimesNet的表示能力可能超过了数据的复杂度 +- **阈值设定困难**: 重构误差分布重叠严重 + +### 3. TimesNet架构不匹配 +- **周期性假设**: TimesNet假设存在周期性,但Contact数据可能缺乏明显周期 +- **时序依赖性**: 异常可能是瞬时的,不依赖长期时序模式 + +## 💡 解决方案 +1. 异常样本复杂度不足,考虑重新定义异常标准 +2. 使用动态阈值或多阈值融合策略 +3. 考虑使用训练数据的重构误差分布来设定阈值 + +## 🎯 关键结论 + +**TimesNet在Contact数据上失败的根本原因可能是:** +1. **数据特性不匹配**: Contact数据缺乏TimesNet擅长的周期性模式 +2. **异常定义问题**: 当前的异常标注可能不够明确或一致 +3. 
**重构范式局限**: 对于这类数据,重构可能不是最佳的异常检测方式 + +**建议采用的替代方案:** +- 基于密度的异常检测 (如Isolation Forest) +- 基于距离的方法 (如LOF) +- 简化的神经网络架构 +- 重新审视异常标注标准 + +重构异常检测失效的根本原因在于:异常样本呈现低复杂度、低方差的拟态行为,且可被线性模型轻易重构。同时,强相关特征结构导致模型可利用冗余信息推断各通道,造成过度泛化。此外,使用较长的时间窗口进一步稀释了异常信号,使重构误差分离度反转,无法作为有效判据。 + +| 问题 | 建议 | +| ------- | ------------------------------------ | +| 重构误差不分离 | 改为监督点分类(逐点标签)或残差序列建模 | +| 相关性过高 | 加入通道压缩(通道 attention)或删除冗余特征 | +| 长窗口稀释异常 | 使用短滑窗(30\~60),或结合多尺度 | +| 周期结构弱扰动 | 使用周期增强 loss,如 trend vs seasonal 误差对比 | +| 模型过度泛化 | 增加重构正则(如Dropout、混合扰动、contrastive) | diff --git "a/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/diagnose_reconstruction_anomaly_detection.py" "b/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/diagnose_reconstruction_anomaly_detection.py" new file mode 100644 index 000000000..6b803345a --- /dev/null +++ "b/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/diagnose_reconstruction_anomaly_detection.py" @@ -0,0 +1,525 @@ +""" +诊断TimesNet重构异常检测失败的深层原因 +分析为什么Contact数据集上的无监督重构方法表现不佳 +""" + +import pandas as pd +import numpy as np +import torch +import torch.nn as nn +from sklearn.preprocessing import StandardScaler +from sklearn.metrics import precision_recall_fscore_support, roc_auc_score +import matplotlib.pyplot as plt +import seaborn as sns +from utils.logger import logger +import os + +class ReconstructionAnomalyDiagnostic: + def __init__(self): + self.results = {} + + def load_contact_data(self): + """加载Contact数据""" + logger.info("📂 加载Contact数据...") + + # 加载训练数据(正常样本) + train_data = pd.read_parquet('./dataset/anomaly_filtered_Traindata/train_normal.parquet') + + # 加载测试数据和标签 + test_data = pd.read_parquet('./dataset/split_data/test.parquet') + test_labels = pd.read_parquet('./dataset/anomaly_labeld_testData/test_labeled.parquet') + + # 模拟ContactLoader的预处理 + train_features = train_data.values[:, 1:] # 跳过TimeStamp + test_features = test_data.values[:, 1:] + test_labels_array = test_labels.values[:, 1:].flatten() + + # 标准化 + scaler = StandardScaler() + train_features = scaler.fit_transform(train_features) + test_features = scaler.transform(test_features) + + logger.info(f"训练数据形状: {train_features.shape}") + logger.info(f"测试数据形状: {test_features.shape}") + logger.info(f"异常比例: {test_labels_array.sum()/len(test_labels_array)*100:.2f}%") + + return train_features, test_features, test_labels_array, scaler + + def analyze_data_characteristics(self, train_data, test_data, test_labels): + """分析数据特性对重构的影响""" + logger.info("🔍 分析数据特性...") + + normal_mask = test_labels == 0 + anomaly_mask = test_labels == 1 + + normal_data = test_data[normal_mask] + anomaly_data = test_data[anomaly_mask] + + # 1. 数据复杂性分析 + logger.info("=== 数据复杂性分析 ===") + + # 特征间相关性 + correlation_matrix = np.corrcoef(train_data.T) + high_corr_pairs = np.sum(np.abs(correlation_matrix) > 0.9) - train_data.shape[1] # 排除对角线 + logger.info(f"高相关性特征对数量 (>0.9): {high_corr_pairs // 2}") + + # 特征方差分析 + feature_variances = np.var(train_data, axis=0) + low_variance_features = np.sum(feature_variances < 0.01) + logger.info(f"低方差特征数量 (<0.01): {low_variance_features}") + + # 数据分布偏度 + from scipy.stats import skew, kurtosis + feature_skewness = [skew(train_data[:, i]) for i in range(train_data.shape[1])] + high_skew_features = np.sum(np.abs(feature_skewness) > 2) + logger.info(f"高偏度特征数量 (|skew|>2): {high_skew_features}") + + # 2. 
正常vs异常样本的可重构性分析 + logger.info("\n=== 可重构性分析 ===") + + # 计算样本内部方差(复杂度) + normal_complexity = np.mean([np.var(sample) for sample in normal_data]) + anomaly_complexity = np.mean([np.var(sample) for sample in anomaly_data]) + + logger.info(f"正常样本平均复杂度: {normal_complexity:.6f}") + logger.info(f"异常样本平均复杂度: {anomaly_complexity:.6f}") + logger.info(f"复杂度比例: {anomaly_complexity/normal_complexity:.3f}") + + # 计算与训练集的相似性 + from sklearn.metrics.pairwise import cosine_similarity + + # 随机采样计算相似性(避免内存问题) + sample_size = min(1000, len(train_data)) + train_sample = train_data[np.random.choice(len(train_data), sample_size, replace=False)] + + normal_similarities = [] + anomaly_similarities = [] + + for i in range(min(100, len(normal_data))): + sim = cosine_similarity([normal_data[i]], train_sample)[0] + normal_similarities.append(np.mean(sim)) + + for i in range(min(100, len(anomaly_data))): + sim = cosine_similarity([anomaly_data[i]], train_sample)[0] + anomaly_similarities.append(np.mean(sim)) + + logger.info(f"正常样本与训练集平均相似性: {np.mean(normal_similarities):.4f}") + logger.info(f"异常样本与训练集平均相似性: {np.mean(anomaly_similarities):.4f}") + + self.results['data_characteristics'] = { + 'high_correlation_pairs': high_corr_pairs // 2, + 'low_variance_features': low_variance_features, + 'high_skew_features': high_skew_features, + 'normal_complexity': normal_complexity, + 'anomaly_complexity': anomaly_complexity, + 'complexity_ratio': anomaly_complexity/normal_complexity, + 'normal_similarity': np.mean(normal_similarities), + 'anomaly_similarity': np.mean(anomaly_similarities) + } + + def test_simple_reconstruction_baselines(self, train_data, test_data, test_labels): + """测试简单重构基线方法""" + logger.info("🧪 测试简单重构基线...") + + results = {} + + # 1. 线性自编码器 + logger.info("测试线性自编码器...") + + class LinearAutoEncoder(nn.Module): + def __init__(self, input_dim, hidden_dim): + super().__init__() + self.encoder = nn.Linear(input_dim, hidden_dim) + self.decoder = nn.Linear(hidden_dim, input_dim) + + def forward(self, x): + encoded = torch.relu(self.encoder(x)) + decoded = self.decoder(encoded) + return decoded + + # 训练线性自编码器 + input_dim = train_data.shape[1] + hidden_dim = input_dim // 2 + + model = LinearAutoEncoder(input_dim, hidden_dim) + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + criterion = nn.MSELoss() + + train_tensor = torch.FloatTensor(train_data) + test_tensor = torch.FloatTensor(test_data) + + # 简单训练 + model.train() + for epoch in range(50): + optimizer.zero_grad() + reconstructed = model(train_tensor) + loss = criterion(reconstructed, train_tensor) + loss.backward() + optimizer.step() + + # 测试重构误差 + model.eval() + with torch.no_grad(): + test_reconstructed = model(test_tensor) + reconstruction_errors = torch.mean((test_tensor - test_reconstructed) ** 2, dim=1).numpy() + + # 计算异常检测性能 + thresholds = np.percentile(reconstruction_errors, [90, 95, 99, 99.5]) + + for i, thresh in enumerate(thresholds): + predictions = (reconstruction_errors > thresh).astype(int) + # 确保标签格式正确 + test_labels_clean = test_labels.astype(int) + precision, recall, f1, _ = precision_recall_fscore_support(test_labels_clean, predictions, average='binary') + results[f'linear_ae_p{[90, 95, 99, 99.5][i]}'] = { + 'threshold': thresh, + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + logger.info(f" P{[90, 95, 99, 99.5][i]}阈值: F1={f1:.4f}, P={precision:.4f}, R={recall:.4f}") + + # 2. 
PCA重构 + logger.info("测试PCA重构...") + from sklearn.decomposition import PCA + + for n_components in [10, 20, input_dim//2]: + if n_components >= input_dim: + continue + + pca = PCA(n_components=n_components) + pca.fit(train_data) + + # 重构测试数据 + test_transformed = pca.transform(test_data) + test_reconstructed_pca = pca.inverse_transform(test_transformed) + + pca_errors = np.mean((test_data - test_reconstructed_pca) ** 2, axis=1) + + thresh = np.percentile(pca_errors, 95) + predictions = (pca_errors > thresh).astype(int) + test_labels_clean = test_labels.astype(int) + precision, recall, f1, _ = precision_recall_fscore_support(test_labels_clean, predictions, average='binary') + + results[f'pca_{n_components}'] = { + 'explained_variance': pca.explained_variance_ratio_.sum(), + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + logger.info(f" PCA-{n_components}: F1={f1:.4f}, 解释方差={pca.explained_variance_ratio_.sum():.3f}") + + # 3. 统计基线:马哈拉诺比斯距离 + logger.info("测试马哈拉诺比斯距离...") + + # 计算训练数据的均值和协方差 + train_mean = np.mean(train_data, axis=0) + train_cov = np.cov(train_data.T) + + # 加入正则化避免奇异矩阵 + train_cov_reg = train_cov + np.eye(train_cov.shape[0]) * 1e-6 + train_cov_inv = np.linalg.inv(train_cov_reg) + + # 计算马哈拉诺比斯距离 + def mahalanobis_distance(x, mean, cov_inv): + diff = x - mean + return np.sqrt(np.sum(diff @ cov_inv * diff, axis=1)) + + mahal_distances = mahalanobis_distance(test_data, train_mean, train_cov_inv) + + thresh = np.percentile(mahal_distances, 95) + predictions = (mahal_distances > thresh).astype(int) + test_labels_clean = test_labels.astype(int) + precision, recall, f1, _ = precision_recall_fscore_support(test_labels_clean, predictions, average='binary') + + results['mahalanobis'] = { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + logger.info(f" 马哈拉诺比斯距离: F1={f1:.4f}") + + self.results['baseline_methods'] = results + + # 分析为什么重构方法失败 + logger.info("\n=== 重构失败原因分析 ===") + + # 检查重构误差分布 + test_labels_clean = test_labels.astype(int) + normal_errors = reconstruction_errors[test_labels_clean == 0] + anomaly_errors = reconstruction_errors[test_labels_clean == 1] + + logger.info(f"正常样本重构误差: 均值={np.mean(normal_errors):.6f}, 标准差={np.std(normal_errors):.6f}") + logger.info(f"异常样本重构误差: 均值={np.mean(anomaly_errors):.6f}, 标准差={np.std(anomaly_errors):.6f}") + logger.info(f"误差分离度: {(np.mean(anomaly_errors) - np.mean(normal_errors)) / np.std(normal_errors):.3f}") + + # 如果异常样本的重构误差不显著高于正常样本,说明模型学习了错误的表示 + if np.mean(anomaly_errors) <= np.mean(normal_errors) * 1.1: + logger.warning("⚠️ 异常样本的重构误差与正常样本相近,模型可能过度泛化") + + def analyze_timesnet_specific_issues(self, train_data, test_data, test_labels): + """分析TimesNet特定的问题""" + logger.info("🔍 分析TimesNet特定问题...") + + # 1. 时序长度问题 + logger.info("=== 时序长度分析 ===") + + # Contact数据的seq_len通常是100,检查这个长度是否合适 + seq_lens_to_test = [50, 100, 200] + + for seq_len in seq_lens_to_test: + if seq_len > len(test_data): + continue + + # 模拟时序窗口 + num_windows = len(test_data) - seq_len + 1 + + # 计算时序窗口内的方差(复杂度) + window_variances = [] + for i in range(min(100, num_windows)): # 只计算前100个窗口 + window = test_data[i:i+seq_len] + window_variance = np.var(window) + window_variances.append(window_variance) + + logger.info(f" seq_len={seq_len}: 窗口平均方差={np.mean(window_variances):.6f}") + + # 2. 
周期性分析 + logger.info("\n=== 周期性分析 ===") + + # 检查数据中是否存在明显的周期性模式 + from scipy.fft import fft, fftfreq + + # 对前几个特征进行FFT分析 + for feat_idx in range(min(3, train_data.shape[1])): + signal = train_data[:min(1440, len(train_data)), feat_idx] # 最多取1天的数据 + + # FFT分析 + fft_values = fft(signal) + freqs = fftfreq(len(signal)) + + # 找到主要频率成分 + magnitude = np.abs(fft_values) + dominant_freq_idx = np.argsort(magnitude[1:len(magnitude)//2])[-3:] # 前3个主要频率 + + logger.info(f" 特征{feat_idx}的主要周期: {[1/freqs[idx+1] if freqs[idx+1] != 0 else 'inf' for idx in dominant_freq_idx]}") + + # 3. 特征重要性分析 + logger.info("\n=== 特征重要性分析 ===") + + # 使用方差和区分力分析特征重要性 + feature_importance = {} + + normal_data = test_data[test_labels == 0] + anomaly_data = test_data[test_labels == 1] + + for i in range(train_data.shape[1]): + # 方差(信息量) + variance = np.var(train_data[:, i]) + + # 区分力(正常vs异常) + normal_mean = np.mean(normal_data[:, i]) + anomaly_mean = np.mean(anomaly_data[:, i]) + pooled_std = np.sqrt((np.var(normal_data[:, i]) + np.var(anomaly_data[:, i])) / 2) + discrimination = abs(normal_mean - anomaly_mean) / (pooled_std + 1e-8) + + feature_importance[i] = { + 'variance': variance, + 'discrimination': discrimination, + 'combined_score': variance * discrimination + } + + # 排序特征 + sorted_features = sorted(feature_importance.items(), key=lambda x: x[1]['combined_score'], reverse=True) + + logger.info("Top 5 重要特征:") + for feat_idx, metrics in sorted_features[:5]: + logger.info(f" 特征{feat_idx}: 方差={metrics['variance']:.4f}, 区分力={metrics['discrimination']:.4f}") + + # 4. 数据质量问题 + logger.info("\n=== 数据质量问题 ===") + + quality_issues = [] + + # 检查异常值 + outlier_ratios = [] + for i in range(train_data.shape[1]): + Q1 = np.percentile(train_data[:, i], 25) + Q3 = np.percentile(train_data[:, i], 75) + IQR = Q3 - Q1 + outliers = np.sum((train_data[:, i] < Q1 - 1.5*IQR) | (train_data[:, i] > Q3 + 1.5*IQR)) + outlier_ratio = outliers / len(train_data) + outlier_ratios.append(outlier_ratio) + + avg_outlier_ratio = np.mean(outlier_ratios) + logger.info(f"平均异常值比例: {avg_outlier_ratio*100:.2f}%") + + if avg_outlier_ratio > 0.1: + quality_issues.append("训练数据包含过多异常值") + + # 检查数据平稳性 + non_stationary_features = 0 + for i in range(min(10, train_data.shape[1])): + # 简单的平稳性检查:前后半段均值差异 + first_half = train_data[:len(train_data)//2, i] + second_half = train_data[len(train_data)//2:, i] + + mean_diff = abs(np.mean(first_half) - np.mean(second_half)) + std_pooled = np.sqrt((np.var(first_half) + np.var(second_half)) / 2) + + if mean_diff > 2 * std_pooled: + non_stationary_features += 1 + + logger.info(f"可能非平稳的特征数: {non_stationary_features}/10") + + if non_stationary_features > 5: + quality_issues.append("数据可能非平稳") + + self.results['timesnet_analysis'] = { + 'feature_importance': dict(sorted_features[:10]), + 'avg_outlier_ratio': avg_outlier_ratio, + 'non_stationary_features': non_stationary_features, + 'quality_issues': quality_issues + } + + def propose_solutions(self): + """提出解决方案""" + logger.info("💡 提出解决方案...") + + solutions = [] + + # 基于分析结果提出解决方案 + data_char = self.results['data_characteristics'] + timesnet_analysis = self.results['timesnet_analysis'] + + # 1. 数据预处理改进 + if data_char['high_correlation_pairs'] > 50: + solutions.append("使用PCA或特征选择减少高相关性特征") + + if data_char['low_variance_features'] > 5: + solutions.append("移除低方差特征") + + if data_char['complexity_ratio'] < 1.2: + solutions.append("异常样本复杂度不足,考虑重新定义异常标准") + + # 2. 模型改进 + if data_char['anomaly_similarity'] > 0.8: + solutions.append("异常样本与训练集相似度过高,考虑使用更复杂的模型") + + # 3. 
训练策略改进 + best_baseline_f1 = max([result['f1'] for result in self.results['baseline_methods'].values()]) + if best_baseline_f1 > 0.2: + solutions.append(f"简单基线方法表现更好(F1={best_baseline_f1:.3f}),TimesNet可能过于复杂") + + # 4. 数据质量改进 + if timesnet_analysis['quality_issues']: + solutions.extend([f"解决数据质量问题: {issue}" for issue in timesnet_analysis['quality_issues']]) + + # 5. 阈值策略改进 + solutions.append("使用动态阈值或多阈值融合策略") + solutions.append("考虑使用训练数据的重构误差分布来设定阈值") + + logger.info("建议的解决方案:") + for i, solution in enumerate(solutions, 1): + logger.info(f" {i}. {solution}") + + self.results['solutions'] = solutions + + def generate_comprehensive_report(self): + """生成综合报告""" + logger.info("📋 生成综合分析报告...") + + report = f""" +# TimesNet重构异常检测失败原因分析报告 + +## 🎯 核心发现 + +### 数据特性问题 +- 高相关性特征对: {self.results['data_characteristics']['high_correlation_pairs']} +- 低方差特征数: {self.results['data_characteristics']['low_variance_features']} +- 异常样本复杂度比例: {self.results['data_characteristics']['complexity_ratio']:.3f} +- 异常样本相似性: {self.results['data_characteristics']['anomaly_similarity']:.3f} + +### 基线方法性能 +""" + + for method, result in self.results['baseline_methods'].items(): + if 'f1' in result: + report += f"- {method}: F1={result['f1']:.4f}\n" + + report += f""" +### TimesNet特定问题 +- 平均异常值比例: {self.results['timesnet_analysis']['avg_outlier_ratio']*100:.2f}% +- 非平稳特征数: {self.results['timesnet_analysis']['non_stationary_features']}/10 +- 数据质量问题: {', '.join(self.results['timesnet_analysis']['quality_issues']) if self.results['timesnet_analysis']['quality_issues'] else '无'} + +## 🔍 失败原因分析 + +### 1. 数据本身的问题 +- **高维度低区分度**: 27个特征中可能存在大量冗余信息 +- **异常样本不够"异常"**: 异常样本与正常样本在特征空间中距离不够远 +- **训练数据过度平滑**: 过滤异常后的训练数据可能过于"完美" + +### 2. 重构任务的固有困难 +- **过度泛化**: 模型学会重构所有样本,包括异常样本 +- **表示能力过强**: TimesNet的表示能力可能超过了数据的复杂度 +- **阈值设定困难**: 重构误差分布重叠严重 + +### 3. TimesNet架构不匹配 +- **周期性假设**: TimesNet假设存在周期性,但Contact数据可能缺乏明显周期 +- **时序依赖性**: 异常可能是瞬时的,不依赖长期时序模式 + +## 💡 解决方案 +""" + + for i, solution in enumerate(self.results['solutions'], 1): + report += f"{i}. {solution}\n" + + report += """ +## 🎯 关键结论 + +**TimesNet在Contact数据上失败的根本原因可能是:** +1. **数据特性不匹配**: Contact数据缺乏TimesNet擅长的周期性模式 +2. **异常定义问题**: 当前的异常标注可能不够明确或一致 +3. 
**重构范式局限**: 对于这类数据,重构可能不是最佳的异常检测方式 + +**建议采用的替代方案:** +- 基于密度的异常检测 (如Isolation Forest) +- 基于距离的方法 (如LOF) +- 简化的神经网络架构 +- 重新审视异常标注标准 +""" + + with open('reconstruction_failure_analysis.md', 'w', encoding='utf-8') as f: + f.write(report) + + logger.info("📄 分析报告已保存: reconstruction_failure_analysis.md") + + def run_full_analysis(self): + """运行完整分析""" + logger.info("🚀 开始重构异常检测失败原因分析...") + + # 加载数据 + train_data, test_data, test_labels, scaler = self.load_contact_data() + + # 各项分析 + self.analyze_data_characteristics(train_data, test_data, test_labels) + self.test_simple_reconstruction_baselines(train_data, test_data, test_labels) + self.analyze_timesnet_specific_issues(train_data, test_data, test_labels) + + # 提出解决方案 + self.propose_solutions() + + # 生成报告 + self.generate_comprehensive_report() + + logger.info("✅ 分析完成!") + + return self.results + +def main(): + diagnostic = ReconstructionAnomalyDiagnostic() + results = diagnostic.run_full_analysis() + return results + +if __name__ == "__main__": + main() \ No newline at end of file diff --git "a/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/verify_contact_anomaly_definition.md" "b/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/verify_contact_anomaly_definition.md" new file mode 100644 index 000000000..abc0ff335 --- /dev/null +++ "b/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/verify_contact_anomaly_definition.md" @@ -0,0 +1,68 @@ +wanting@jerry:~/Time-Series-Library$ pym verify_contact_anomaly_definition.py +▶️ Running: python3 -m verify_contact_anomaly_definition +[2025-06-25 21:51:02,337] [INFO] [default] - Logger initialized. Log file: experiments/logs/default_20250625_215102.log +[2025-06-25 21:51:02,337] [INFO] [default] - 验证Contact异常定义... 
+[2025-06-25 21:51:02,386] [INFO] [default] - 测试数据形状: (134700, 27) +[2025-06-25 21:51:02,386] [INFO] [default] - 异常样本数: 6951 / 134700 (5.16%) +[2025-06-25 21:51:02,414] [INFO] [default] - +=== 统计特性对比 === +[2025-06-25 21:51:02,414] [INFO] [default] - 正常样本统计: +[2025-06-25 21:51:02,414] [INFO] [default] - 数量: 127749 +[2025-06-25 21:51:02,419] [INFO] [default] - 均值范围: [-0.0234, 0.0352] +[2025-06-25 21:51:02,438] [INFO] [default] - 标准差范围: [0.9999, 1.0205] +[2025-06-25 21:51:02,443] [INFO] [default] - 方差总和: 1.0210 +[2025-06-25 21:51:02,443] [INFO] [default] - +异常样本统计: +[2025-06-25 21:51:02,443] [INFO] [default] - 数量: 6951 +[2025-06-25 21:51:02,443] [INFO] [default] - 均值范围: [-0.6471, 0.4305] +[2025-06-25 21:51:02,444] [INFO] [default] - 标准差范围: [0.2747, 0.9940] +[2025-06-25 21:51:02,444] [INFO] [default] - 方差总和: 0.5939 +[2025-06-25 21:51:02,444] [INFO] [default] - +=== 无监督异常检测验证 === +[2025-06-25 21:51:02,954] [INFO] [default] - Isolation Forest一致性: 0.9022 +[2025-06-25 21:51:02,986] [INFO] [default] - PCA重构误差一致性: 0.9023 +[2025-06-25 21:51:02,986] [INFO] [default] - +=== 错误分类分析 === +[2025-06-25 21:51:02,987] [INFO] [default] - 被标记为正常但重构误差高的样本: 72429 +[2025-06-25 21:51:02,987] [INFO] [default] - 被标记为异常但重构误差低的样本: 4035 +[2025-06-25 21:51:02,987] [INFO] [default] - +=== 特征级异常分析 === +[2025-06-25 21:51:03,005] [WARNING] [default] - 发现可疑特征(异常样本更接近均值): +[2025-06-25 21:51:03,005] [WARNING] [default] - aCosPhi_L2: 比值=1.70 +[2025-06-25 21:51:03,005] [WARNING] [default] - aCosPhi_L1: 比值=1.59 +[2025-06-25 21:51:03,005] [WARNING] [default] - aCosPhi_L3: 比值=1.49 +[2025-06-25 21:51:03,005] [WARNING] [default] - aCurrentL1: 比值=1.37 +[2025-06-25 21:51:03,005] [WARNING] [default] - airTempeatur: 比值=1.37 +[2025-06-25 21:51:03,005] [INFO] [default] - +=== 时序模式分析 === +[2025-06-25 21:51:03,010] [INFO] [default] - 正常样本平均变化率: 0.5972 +[2025-06-25 21:51:03,010] [INFO] [default] - 异常样本平均变化率: 0.5185 +[2025-06-25 21:51:03,010] [INFO] [default] - +=== 结论 === +[2025-06-25 21:51:03,010] [INFO] [default] - +建议: +[2025-06-25 21:51:03,010] [INFO] [default] - 1. 重新审视异常的定义和标注过程 +[2025-06-25 21:51:03,010] [INFO] [default] - 2. 考虑使用监督学习方法而非重构方法 +[2025-06-25 21:51:03,010] [INFO] [default] - 3. 分析原始时间序列数据而非静态特征 +[2025-06-25 21:51:03,010] [INFO] [default] - 4. 与数据提供方确认异常的具体含义 + + + +🚨 核心问题:异常样本实际上更"简单" +正常样本统计: + 方差总和: 1.0210 + +异常样本统计: + 方差总和: 0.5939 (明显更小!) + +这说明什么? +异常样本变化更小、更规律 +正常样本反而更复杂、变化更大 +这与传统的"异常=复杂"假设相反 +🎯 为什么TimesNet失败? +TimesNet基于重构误差的异常检测假设: +🔴 假设:异常样本难以重构 → 重构误差大 +🔴 现实:Contact的异常样本更简单 → 重构误差小 +所以TimesNet会错误地将: +❌ 复杂的正常样本判断为异常 +❌ 简单的异常样本判断为正常 \ No newline at end of file diff --git "a/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/verify_contact_anomaly_definition.py" "b/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/verify_contact_anomaly_definition.py" new file mode 100644 index 000000000..488a2c0d2 --- /dev/null +++ "b/data_preprocess/\346\225\260\346\215\256\345\210\206\346\236\220\346\212\245\345\221\212/verify_contact_anomaly_definition.py" @@ -0,0 +1,162 @@ +import pandas as pd +import numpy as np +from sklearn.preprocessing import StandardScaler +from sklearn.ensemble import IsolationForest +from sklearn.decomposition import PCA +from utils.logger import logger + +def verify_contact_anomaly_definition(): + """验证Contact数据集的异常定义""" + logger.info("验证Contact异常定义...") + + # 1. 
加载数据 + contact_test = pd.read_parquet("./dataset/split_data/test.parquet") + contact_labels = pd.read_parquet("./dataset/anomaly_labeld_testData/test_labeled.parquet") + + # 跳过TimeStamp列 + test_data = contact_test.iloc[:, 1:].values + test_labels = contact_labels.iloc[:, 1:].values.flatten() + + logger.info(f"测试数据形状: {test_data.shape}") + logger.info(f"异常样本数: {test_labels.sum()} / {len(test_labels)} ({test_labels.sum()/len(test_labels)*100:.2f}%)") + + # 2. 标准化数据 + scaler = StandardScaler() + test_data_scaled = scaler.fit_transform(test_data) + + # 3. 分析正常vs异常样本的统计特性 + normal_mask = test_labels == 0 + anomaly_mask = test_labels == 1 + + normal_data = test_data_scaled[normal_mask] + anomaly_data = test_data_scaled[anomaly_mask] + + logger.info(f"\n=== 统计特性对比 ===") + logger.info(f"正常样本统计:") + logger.info(f" 数量: {len(normal_data)}") + logger.info(f" 均值范围: [{normal_data.mean(axis=0).min():.4f}, {normal_data.mean(axis=0).max():.4f}]") + logger.info(f" 标准差范围: [{normal_data.std(axis=0).min():.4f}, {normal_data.std(axis=0).max():.4f}]") + logger.info(f" 方差总和: {normal_data.var().sum():.4f}") + + logger.info(f"\n异常样本统计:") + logger.info(f" 数量: {len(anomaly_data)}") + logger.info(f" 均值范围: [{anomaly_data.mean(axis=0).min():.4f}, {anomaly_data.mean(axis=0).max():.4f}]") + logger.info(f" 标准差范围: [{anomaly_data.std(axis=0).min():.4f}, {anomaly_data.std(axis=0).max():.4f}]") + logger.info(f" 方差总和: {anomaly_data.var().sum():.4f}") + + # 4. 使用无监督异常检测验证 + logger.info(f"\n=== 无监督异常检测验证 ===") + + # 4.1 Isolation Forest + iso_forest = IsolationForest(contamination=0.05, random_state=42) + iso_predictions = iso_forest.fit_predict(test_data_scaled) + iso_anomalies = (iso_predictions == -1) + + # 计算与真实标签的一致性 + iso_agreement = (iso_anomalies == (test_labels == 1)).mean() + logger.info(f"Isolation Forest一致性: {iso_agreement:.4f}") + + # 4.2 PCA重构误差 + pca = PCA(n_components=10) + pca.fit(test_data_scaled) + + reconstructed = pca.inverse_transform(pca.transform(test_data_scaled)) + reconstruction_errors = np.mean((test_data_scaled - reconstructed) ** 2, axis=1) + + # 按重构误差排序,前5%作为异常 + error_threshold = np.percentile(reconstruction_errors, 95) + pca_anomalies = reconstruction_errors > error_threshold + + pca_agreement = (pca_anomalies == (test_labels == 1)).mean() + logger.info(f"PCA重构误差一致性: {pca_agreement:.4f}") + + # 5. 分析错误分类的样本 + logger.info(f"\n=== 错误分类分析 ===") + + # 被标记为正常但重构误差高的样本 + false_normal = normal_mask & (reconstruction_errors > np.median(reconstruction_errors[anomaly_mask])) + false_normal_count = false_normal.sum() + + # 被标记为异常但重构误差低的样本 + false_anomaly = anomaly_mask & (reconstruction_errors < np.median(reconstruction_errors[normal_mask])) + false_anomaly_count = false_anomaly.sum() + + logger.info(f"被标记为正常但重构误差高的样本: {false_normal_count}") + logger.info(f"被标记为异常但重构误差低的样本: {false_anomaly_count}") + + # 6. 
特征级别的异常分析 + logger.info(f"\n=== 特征级异常分析 ===") + feature_names = list(contact_test.columns[1:]) + + suspicious_features = [] + for i, feature_name in enumerate(feature_names): + normal_vals = test_data_scaled[normal_mask, i] + anomaly_vals = test_data_scaled[anomaly_mask, i] + + # 检查异常样本是否更接近0(标准化后) + normal_abs_mean = np.abs(normal_vals).mean() + anomaly_abs_mean = np.abs(anomaly_vals).mean() + + if anomaly_abs_mean < normal_abs_mean: + ratio = normal_abs_mean / (anomaly_abs_mean + 1e-8) + suspicious_features.append((feature_name, ratio)) + + suspicious_features.sort(key=lambda x: x[1], reverse=True) + + if suspicious_features: + logger.warning(f"发现可疑特征(异常样本更接近均值):") + for name, ratio in suspicious_features[:5]: + logger.warning(f" {name}: 比值={ratio:.2f}") + + # 7. 时间序列模式分析 + logger.info(f"\n=== 时序模式分析 ===") + + # 分析前1000个样本的时序特性 + sample_size = min(1000, len(test_data)) + sample_data = test_data_scaled[:sample_size] + sample_labels = test_labels[:sample_size] + + normal_indices = np.where(sample_labels == 0)[0] + anomaly_indices = np.where(sample_labels == 1)[0] + + if len(normal_indices) > 0 and len(anomaly_indices) > 0: + # 计算相邻时间点的变化率 + normal_changes = [] + anomaly_changes = [] + + for i in normal_indices[:-1]: + if i+1 in normal_indices: + change = np.linalg.norm(sample_data[i+1] - sample_data[i]) + normal_changes.append(change) + + for i in anomaly_indices[:-1]: + if i+1 in anomaly_indices: + change = np.linalg.norm(sample_data[i+1] - sample_data[i]) + anomaly_changes.append(change) + + if normal_changes and anomaly_changes: + logger.info(f"正常样本平均变化率: {np.mean(normal_changes):.4f}") + logger.info(f"异常样本平均变化率: {np.mean(anomaly_changes):.4f}") + + # 8. 结论和建议 + logger.info(f"\n=== 结论 ===") + + if pca_agreement < 0.3: + logger.error("❌ PCA重构误差与标签严重不一致!") + logger.error("可能原因:") + logger.error(" 1. 异常标签定义与重构复杂度无关") + logger.error(" 2. 数据预处理丢失了关键信息") + logger.error(" 3. 异常可能是基于其他维度定义的(如时间、业务逻辑)") + + if iso_agreement < 0.3: + logger.error("❌ Isolation Forest与标签也不一致!") + logger.error("这进一步确认了标签定义的问题") + + logger.info(f"\n建议:") + logger.info(f" 1. 重新审视异常的定义和标注过程") + logger.info(f" 2. 考虑使用监督学习方法而非重构方法") + logger.info(f" 3. 分析原始时间序列数据而非静态特征") + logger.info(f" 4. 
与数据提供方确认异常的具体含义") + +if __name__ == "__main__": + verify_contact_anomaly_definition() \ No newline at end of file diff --git a/data_provider/data_factory.py b/data_provider/data_factory.py index 7fc458f69..a2a9d9067 100644 --- a/data_provider/data_factory.py +++ b/data_provider/data_factory.py @@ -1,5 +1,5 @@ from data_provider.data_loader import Dataset_ETT_hour, Dataset_ETT_minute, Dataset_Custom, Dataset_M4, PSMSegLoader, \ - MSLSegLoader, SMAPSegLoader, SMDSegLoader, SWATSegLoader, UEAloader + MSLSegLoader, SMAPSegLoader, SMDSegLoader, SWATSegLoader, UEAloader, ContactLoader, SupervisedContactLoader from data_provider.uea import collate_fn from torch.utils.data import DataLoader @@ -15,7 +15,9 @@ 'SMAP': SMAPSegLoader, 'SMD': SMDSegLoader, 'SWAT': SWATSegLoader, - 'UEA': UEAloader + 'UEA': UEAloader, + 'Contact': ContactLoader, + 'SupervisedContact': SupervisedContactLoader } @@ -44,6 +46,23 @@ def data_provider(args, flag): num_workers=args.num_workers, drop_last=drop_last) return data_set, data_loader + elif args.task_name == 'supervised_anomaly_detection': + drop_last = False + data_set = Data( + args = args, + root_path=args.root_path, + win_size=args.seq_len, + flag=flag, + ) + print(flag, len(data_set)) + data_loader = DataLoader( + data_set, + batch_size=batch_size, + shuffle=shuffle_flag, + num_workers=args.num_workers, + drop_last=drop_last) + return data_set, data_loader + elif args.task_name == 'classification': drop_last = False data_set = Data( diff --git a/data_provider/data_loader.py b/data_provider/data_loader.py index dcbea3124..04b9e9245 100644 --- a/data_provider/data_loader.py +++ b/data_provider/data_loader.py @@ -433,6 +433,71 @@ def __getitem__(self, index): self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) +class ContactLoader(Dataset): + def __init__(self, args, root_path, win_size, step=1, flag="train"): + self.flag = flag + self.step = step + self.win_size = win_size + self.scaler = StandardScaler() + + # Load training data + data = pd.read_parquet(os.path.join(root_path, 'feature_filtered_data/train_normal_filtered.parquet')) + # Check if you need to skip first column - adjust this based on your data structure + # If your data doesn't have an index column to skip, remove the slicing + data = data.values[:, 1:] if data.shape[1] > 1 else data.values + data = np.nan_to_num(data) + + # Fit scaler and transform training data + self.scaler.fit(data) + data = self.scaler.transform(data) + + # Load test data + test_data = pd.read_parquet(os.path.join(root_path, 'feature_filtered_data/test_filtered.parquet')) + # Adjust slicing based on your data structure + test_data = test_data.values[:, 1:] + test_data = np.nan_to_num(test_data) + self.test = self.scaler.transform(test_data) + + # Set training data and create validation split + self.train = data + data_len = len(self.train) + self.val = self.train[(int)(data_len * 0.8):] + + # Load test labels + test_labels_df = pd.read_parquet(os.path.join(root_path, 'anomaly_labeld_testData/test_labeled.parquet')) + # Adjust slicing based on your data structure + self.test_labels = test_labels_df.values[:, 1:] + + print("test:", self.test.shape) + print("train:", self.train.shape) + def __len__(self): + if self.flag == "train": + return (self.train.shape[0] - self.win_size) // self.step + 1 + elif (self.flag == 'val'): + return (self.val.shape[0] - self.win_size) // self.step + 1 + elif (self.flag == 'test'): + return (self.test.shape[0] - self.win_size) // self.step + 1 + 
else: + return (self.test.shape[0] - self.win_size) // self.win_size + 1 + + def __getitem__(self, index): + index = index * self.step + if self.flag == "train": + return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) + elif (self.flag == 'val'): + return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) + elif (self.flag == 'test'): + return np.float32(self.test[index:index + self.win_size]), np.float32( + self.test_labels[index:index + self.win_size]) + else: + return np.float32(self.test[ + index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( + self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) + + + + + class MSLSegLoader(Dataset): def __init__(self, args, root_path, win_size, step=1, flag="train"): self.flag = flag @@ -746,3 +811,115 @@ def __getitem__(self, ind): def __len__(self): return len(self.all_IDs) + + +class SupervisedContactLoader(Dataset): + """ + Data loader for supervised Contact anomaly detection + Loads labeled train/test data from parquet files + """ + def __init__(self, args, root_path, win_size, step=1, flag="train"): + self.flag = flag + self.step = step + self.win_size = win_size + self.scaler = StandardScaler() + self.args = args + + from utils.logger import logger + logger.info(f"🔄 初始化监督Contact数据加载器,flag={flag}, win_size={win_size}") + + if flag.lower() in ['train', 'training']: + # 加载训练数据 + train_path = os.path.join(root_path, 'supervised/lable/train_labeled.parquet') + logger.info(f"📂 加载训练数据: {train_path}") + + train_df = pd.read_parquet(train_path) + logger.info(f"训练数据shape: {train_df.shape}") + + # 提取特征和标签 + feature_columns = [col for col in train_df.columns if col not in ['TimeStamp', 'label']] + self.data = train_df[feature_columns].values.astype(np.float32) + self.labels = train_df['label'].values.astype(np.int32) + + # 数据标准化 + self.data = np.nan_to_num(self.data) + self.scaler.fit(self.data) + self.data = self.scaler.transform(self.data) + + # 训练验证分割 (80/20) + data_len = len(self.data) + split_idx = int(data_len * 0.8) + self.train_data = self.data[:split_idx] + self.train_labels = self.labels[:split_idx] + self.val_data = self.data[split_idx:] + self.val_labels = self.labels[split_idx:] + + logger.info(f"训练集: {self.train_data.shape}, 正常样本: {np.sum(self.train_labels==0)}, 异常样本: {np.sum(self.train_labels==1)}") + logger.info(f"验证集: {self.val_data.shape}, 正常样本: {np.sum(self.val_labels==0)}, 异常样本: {np.sum(self.val_labels==1)}") + + elif flag.lower() in ['val', 'validation']: + # 验证数据从训练集分割得到,需要先初始化训练模式 + temp_loader = SupervisedContactLoader(args, root_path, win_size, step, 'train') + self.scaler = temp_loader.scaler + self.data = temp_loader.val_data + self.labels = temp_loader.val_labels + logger.info(f"使用验证数据: {self.data.shape}") + + elif flag.lower() in ['test', 'testing']: + # 加载测试数据 + test_path = os.path.join(root_path, 'supervised/lable/test_labeled.parquet') + train_path = os.path.join(root_path, 'supervised/lable/train_labeled.parquet') + + logger.info(f"📂 加载测试数据: {test_path}") + + # 先加载训练数据来拟合标准化器 + train_df = pd.read_parquet(train_path) + feature_columns = [col for col in train_df.columns if col not in ['TimeStamp', 'label']] + train_data = train_df[feature_columns].values.astype(np.float32) + train_data = np.nan_to_num(train_data) + self.scaler.fit(train_data) + + # 加载并处理测试数据 + test_df = pd.read_parquet(test_path) + logger.info(f"测试数据shape: 
{test_df.shape}") + + self.data = test_df[feature_columns].values.astype(np.float32) + self.labels = test_df['label'].values.astype(np.int32) + + # 使用训练集的标准化器 + self.data = np.nan_to_num(self.data) + self.data = self.scaler.transform(self.data) + + logger.info(f"测试集: {self.data.shape}, 正常样本: {np.sum(self.labels==0)}, 异常样本: {np.sum(self.labels==1)}") + + self.num_features = self.data.shape[1] + logger.info(f"特征维度: {self.num_features}") + + def __len__(self): + if self.flag.lower() in ['train', 'training']: + return max(0, (self.train_data.shape[0] - self.win_size) // self.step + 1) + elif self.flag.lower() in ['val', 'validation']: + return max(0, (self.data.shape[0] - self.win_size) // self.step + 1) + else: # test + return max(0, (self.data.shape[0] - self.win_size) // self.step + 1) + + def __getitem__(self, index): + index = index * self.step + + if self.flag.lower() in ['train', 'training']: + # 训练数据 + seq = self.train_data[index:index + self.win_size] + # 逐点标签:返回每个时间点的标签 + window_labels = self.train_labels[index:index + self.win_size] + + elif self.flag.lower() in ['val', 'validation']: + # 验证数据 + seq = self.data[index:index + self.win_size] + window_labels = self.labels[index:index + self.win_size] + + else: # test + # 测试数据 + seq = self.data[index:index + self.win_size] + window_labels = self.labels[index:index + self.win_size] + + return np.float32(seq), np.int64(window_labels) # 返回每个时间点的标签 diff --git a/exp/exp_anomaly_detection.py b/exp/exp_anomaly_detection.py index dfddc2306..9ebf7a8ef 100644 --- a/exp/exp_anomaly_detection.py +++ b/exp/exp_anomaly_detection.py @@ -3,6 +3,7 @@ from utils.tools import EarlyStopping, adjust_learning_rate, adjustment from sklearn.metrics import precision_recall_fscore_support from sklearn.metrics import accuracy_score +from sklearn.metrics import precision_recall_curve, auc import torch.multiprocessing torch.multiprocessing.set_sharing_strategy('file_system') @@ -192,15 +193,20 @@ def test(self, setting, test=0): accuracy = accuracy_score(gt, pred) precision, recall, f_score, support = precision_recall_fscore_support(gt, pred, average='binary') - print("Accuracy : {:0.4f}, Precision : {:0.4f}, Recall : {:0.4f}, F-score : {:0.4f} ".format( + + # Calculate PRAUC using continuous anomaly scores (test_energy) + precision_curve, recall_curve, _ = precision_recall_curve(gt, test_energy) + prauc = auc(recall_curve, precision_curve) + + print("Accuracy : {:0.4f}, Precision : {:0.4f}, Recall : {:0.4f}, F-score : {:0.4f}, PRAUC : {:0.4f} ".format( accuracy, precision, - recall, f_score)) + recall, f_score, prauc)) f = open("result_anomaly_detection.txt", 'a') f.write(setting + " \n") - f.write("Accuracy : {:0.4f}, Precision : {:0.4f}, Recall : {:0.4f}, F-score : {:0.4f} ".format( + f.write("Accuracy : {:0.4f}, Precision : {:0.4f}, Recall : {:0.4f}, F-score : {:0.4f}, PRAUC : {:0.4f} ".format( accuracy, precision, - recall, f_score)) + recall, f_score, prauc)) f.write('\n') f.write('\n') f.close() diff --git a/exp/exp_anomaly_detection_contact_supervised.py b/exp/exp_anomaly_detection_contact_supervised.py new file mode 100644 index 000000000..f759201c2 --- /dev/null +++ b/exp/exp_anomaly_detection_contact_supervised.py @@ -0,0 +1,276 @@ +from data_provider.data_factory import data_provider +from exp.exp_basic import Exp_Basic +from utils.tools import EarlyStopping, adjust_learning_rate +from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score +import torch.multiprocessing 
+torch.multiprocessing.set_sharing_strategy('file_system') +import torch +import torch.nn as nn +from torch import optim +import os +import time +import warnings +import numpy as np +from utils.logger import logger + +warnings.filterwarnings('ignore') + + +class Exp_Anomaly_Detection_Contact_Supervised(Exp_Basic): + def __init__(self, args): + super(Exp_Anomaly_Detection_Contact_Supervised, self).__init__(args) + + def _build_model(self): + # 使用TimesNet作为特征提取器,然后添加分类头 + model = self.model_dict[self.args.model].Model(self.args).float() + + # 添加分类层 + feature_dim = self.args.d_model * self.args.seq_len # TimesNet输出维度 + + # 添加分类头:特征提取 -> 全连接 -> 二分类 + self.classifier = nn.Sequential( + nn.AdaptiveAvgPool1d(1), # 全局平均池化 + nn.Flatten(), + nn.Linear(self.args.enc_in, 128), # enc_in是特征维度 + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(128, 64), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(64, 2) # 二分类:正常/异常 + ).float().to(self.device) + + if self.args.use_multi_gpu and self.args.use_gpu: + model = nn.DataParallel(model, device_ids=self.args.device_ids) + self.classifier = nn.DataParallel(self.classifier, device_ids=self.args.device_ids) + + return model + + def _get_data(self, flag): + data_set, data_loader = data_provider(self.args, flag) + return data_set, data_loader + + def _select_optimizer(self): + # 同时优化特征提取器和分类器 + params = list(self.model.parameters()) + list(self.classifier.parameters()) + model_optim = optim.Adam(params, lr=self.args.learning_rate) + return model_optim + + def _select_criterion(self): + # 使用交叉熵损失进行分类 + criterion = nn.CrossEntropyLoss() + return criterion + + def vali(self, vali_data, vali_loader, criterion): + total_loss = [] + self.model.eval() + self.classifier.eval() + + with torch.no_grad(): + for i, (batch_x, batch_y) in enumerate(vali_loader): + batch_x = batch_x.float().to(self.device) + + # 处理标签 + if len(batch_y.shape) == 2: + # 如果窗口中有任何异常点,则窗口标记为异常 + batch_labels = torch.max(batch_y, dim=1)[0].long().to(self.device) + else: + batch_labels = batch_y.long().to(self.device) + + # 特征提取 + features = self.model(batch_x, None, None, None) # [batch, seq_len, features] + + # 全局平均池化用于分类 + pooled_features = torch.mean(features, dim=1) # [batch, features] + + # 分类 + logits = self.classifier(pooled_features) + + loss = criterion(logits, batch_labels) + total_loss.append(loss.item()) + + total_loss = np.average(total_loss) + self.model.train() + self.classifier.train() + return total_loss + + def train(self, setting): + train_data, train_loader = self._get_data(flag='train') + vali_data, vali_loader = self._get_data(flag='val') + test_data, test_loader = self._get_data(flag='test') + + path = os.path.join(self.args.checkpoints, setting) + if not os.path.exists(path): + os.makedirs(path) + + time_now = time.time() + train_steps = len(train_loader) + early_stopping = EarlyStopping(patience=self.args.patience, verbose=True) + + model_optim = self._select_optimizer() + criterion = self._select_criterion() + + logger.info("🎯 开始监督学习训练...") + + for epoch in range(self.args.train_epochs): + iter_count = 0 + train_loss = [] + + self.model.train() + self.classifier.train() + epoch_time = time.time() + + for i, (batch_x, batch_y) in enumerate(train_loader): + iter_count += 1 + model_optim.zero_grad() + + batch_x = batch_x.float().to(self.device) + + # 处理标签 + if len(batch_y.shape) == 2: + # 如果窗口中有任何异常点,则窗口标记为异常 + batch_labels = torch.max(batch_y, dim=1)[0].long().to(self.device) + else: + batch_labels = batch_y.long().to(self.device) + + # 特征提取 + features = self.model(batch_x, None, None, 
None) # [batch, seq_len, features] + + # 全局平均池化用于分类 + pooled_features = torch.mean(features, dim=1) # [batch, features] + + # 分类 + logits = self.classifier(pooled_features) + + loss = criterion(logits, batch_labels) + train_loss.append(loss.item()) + + if (i + 1) % 100 == 0: + logger.info("\titers: {0}, epoch: {1} | loss: {2:.7f}".format(i + 1, epoch + 1, loss.item())) + speed = (time.time() - time_now) / iter_count + left_time = speed * ((self.args.train_epochs - epoch) * train_steps - i) + logger.info('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(speed, left_time)) + iter_count = 0 + time_now = time.time() + + loss.backward() + model_optim.step() + + logger.info("Epoch: {} cost time: {}".format(epoch + 1, time.time() - epoch_time)) + train_loss = np.average(train_loss) + vali_loss = self.vali(vali_data, vali_loader, criterion) + test_loss = self.vali(test_data, test_loader, criterion) + + logger.info("Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}".format( + epoch + 1, train_steps, train_loss, vali_loss, test_loss)) + + early_stopping(vali_loss, self.model, path) + if early_stopping.early_stop: + logger.info("Early stopping") + break + + adjust_learning_rate(model_optim, epoch + 1, self.args) + + # 保存分类器 + best_model_path = path + '/' + 'checkpoint.pth' + classifier_path = path + '/' + 'classifier.pth' + + self.model.load_state_dict(torch.load(best_model_path)) + torch.save(self.classifier.state_dict(), classifier_path) + + return self.model + + def test(self, setting, test=0): + test_data, test_loader = self._get_data(flag='test') + + if test: + logger.info('loading model') + model_path = os.path.join('./checkpoints/' + setting, 'checkpoint.pth') + classifier_path = os.path.join('./checkpoints/' + setting, 'classifier.pth') + + self.model.load_state_dict(torch.load(model_path)) + self.classifier.load_state_dict(torch.load(classifier_path)) + + folder_path = './test_results/' + setting + '/' + if not os.path.exists(folder_path): + os.makedirs(folder_path) + + self.model.eval() + self.classifier.eval() + + # 收集预测结果和真实标签 + all_predictions = [] + all_probabilities = [] + all_labels = [] + + logger.info("🔍 开始监督学习测试...") + + with torch.no_grad(): + for i, (batch_x, batch_y) in enumerate(test_loader): + batch_x = batch_x.float().to(self.device) + + # 处理标签 + if len(batch_y.shape) == 2: + batch_labels = torch.max(batch_y, dim=1)[0].numpy() + else: + batch_labels = batch_y.numpy() + + # 特征提取 + features = self.model(batch_x, None, None, None) # [batch, seq_len, features] + + # 全局平均池化用于分类 + pooled_features = torch.mean(features, dim=1) # [batch, features] + + # 分类 + logits = self.classifier(pooled_features) + probabilities = torch.softmax(logits, dim=1) + predictions = torch.argmax(logits, dim=1) + + all_predictions.extend(predictions.cpu().numpy()) + all_probabilities.extend(probabilities[:, 1].cpu().numpy()) # 异常类概率 + all_labels.extend(batch_labels) + + all_predictions = np.array(all_predictions) + all_probabilities = np.array(all_probabilities) + all_labels = np.array(all_labels).astype(int) + + # 计算评估指标 + precision, recall, f_score, support = precision_recall_fscore_support( + all_labels, all_predictions, average='binary' + ) + accuracy = accuracy_score(all_labels, all_predictions) + + try: + auc_score = roc_auc_score(all_labels, all_probabilities) + except: + auc_score = 0.0 + + results = { + 'precision': precision, + 'recall': recall, + 'f_score': f_score, + 'accuracy': accuracy, + 'auc': auc_score + } + + logger.info("📊 监督学习结果:") + logger.info(f" 
Precision: {precision:.4f}") + logger.info(f" Recall: {recall:.4f}") + logger.info(f" F1-Score: {f_score:.4f}") + logger.info(f" Accuracy: {accuracy:.4f}") + logger.info(f" AUC: {auc_score:.4f}") + + # 分析预测分布 + unique_labels, label_counts = np.unique(all_labels, return_counts=True) + unique_preds, pred_counts = np.unique(all_predictions, return_counts=True) + + logger.info(f" 真实标签分布: {dict(zip(unique_labels, label_counts))}") + logger.info(f" 预测标签分布: {dict(zip(unique_preds, pred_counts))}") + + # 保存结果 + np.save(folder_path + 'metrics.npy', results) + np.save(folder_path + 'predictions.npy', all_predictions) + np.save(folder_path + 'probabilities.npy', all_probabilities) + np.save(folder_path + 'true_labels.npy', all_labels) + + return results \ No newline at end of file diff --git a/exp/exp_supervised_anomaly_detection.py b/exp/exp_supervised_anomaly_detection.py new file mode 100644 index 000000000..9ce4084cc --- /dev/null +++ b/exp/exp_supervised_anomaly_detection.py @@ -0,0 +1,365 @@ +from data_provider.data_factory import data_provider +from exp.exp_basic import Exp_Basic +from utils.tools import EarlyStopping, adjust_learning_rate +from sklearn.metrics import precision_recall_fscore_support, accuracy_score, precision_recall_curve, auc, confusion_matrix, classification_report +from utils.logger import logger +import torch.multiprocessing +import torch +import torch.nn as nn +from torch import optim +import os +import time +import warnings +import numpy as np +from tqdm import tqdm + +torch.multiprocessing.set_sharing_strategy('file_system') +warnings.filterwarnings('ignore') + + +class Exp_Supervised_Anomaly_Detection(Exp_Basic): + def __init__(self, args): + super(Exp_Supervised_Anomaly_Detection, self).__init__(args) + + def _build_model(self): + # 获取数据信息来配置模型 + train_data, train_loader = self._get_data(flag='train') + + # 设置模型参数 + self.args.enc_in = train_data.num_features # 特征维度 + self.args.pred_len = 0 # 不需要预测长度 + self.args.output_attention = False + + # 创建TimesNet模型 + model = self.model_dict[self.args.model].Model(self.args).float() + + # 添加二分类头 + if hasattr(model, 'd_model'): + hidden_dim = model.d_model + else: + hidden_dim = self.args.d_model + + # 创建改进的分类头 - 更深的网络 + self.classification_head = nn.Sequential( + nn.Dropout(0.4), + nn.Linear(hidden_dim, hidden_dim * 2), # 扩展维度 + nn.ReLU(), + nn.Dropout(0.3), + nn.Linear(hidden_dim * 2, hidden_dim), + nn.ReLU(), + nn.Dropout(0.2), + nn.Linear(hidden_dim, hidden_dim // 2), + nn.ReLU(), + nn.Linear(hidden_dim // 2, 1) # 每个时间步的二分类输出 + ).float() + + if self.args.use_multi_gpu and self.args.use_gpu: + model = nn.DataParallel(model, device_ids=self.args.device_ids) + self.classification_head = nn.DataParallel(self.classification_head, device_ids=self.args.device_ids) + + # 将分类头移动到合适的设备 + if self.args.use_gpu: + self.classification_head = self.classification_head.cuda() + + return model + + def _get_data(self, flag): + data_set, data_loader = data_provider(self.args, flag) + return data_set, data_loader + + def _select_optimizer(self): + # 联合优化TimesNet和分类头 + params = list(self.model.parameters()) + list(self.classification_head.parameters()) + model_optim = optim.Adam(params, lr=self.args.learning_rate, weight_decay=1e-5) + return model_optim + + def _select_criterion(self): + # 使用更强的权重处理类别不平衡 + # 增加异常样本权重到20倍 + pos_weight = torch.tensor([20.0]) # 大幅增加异常样本权重 + if self.args.use_gpu: + pos_weight = pos_weight.cuda() + criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight) + return criterion + + def _extract_features(self, batch_x): + 
"""使用TimesNet提取特征""" + # 手动进行TimesNet的特征提取,类似于anomaly_detection方法 + # 但不做最终的projection,而是返回编码后的特征 + + # Normalization from Non-stationary Transformer + means = batch_x.mean(1, keepdim=True).detach() + x_enc = batch_x.sub(means) + stdev = torch.sqrt( + torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5) + x_enc = x_enc.div(stdev) + + # embedding + enc_out = self.model.enc_embedding(x_enc, None) # [B,T,C] + + # TimesNet encoding layers + for i in range(self.model.layer): + enc_out = self.model.layer_norm(self.model.model[i](enc_out)) + + # 返回编码特征而不是最终输出 + return enc_out # [B, T, d_model] + + def vali(self, vali_data, vali_loader, criterion): + total_loss = [] + preds = [] + trues = [] + + self.model.eval() + self.classification_head.eval() + + with torch.no_grad(): + for i, (batch_x, batch_y) in enumerate(vali_loader): + batch_x = batch_x.float().to(self.device) + batch_y = batch_y.long().to(self.device) # [batch_size, seq_len] + + # 提取TimesNet特征 + features = self._extract_features(batch_x) # [batch_size, seq_len, features] + + # 通过分类头得到每个时间步的预测 + outputs = self.classification_head(features) # [batch_size, seq_len, 1] + outputs = outputs.squeeze(-1) # [batch_size, seq_len] + + # 计算损失 - 逐点损失 + loss = criterion(outputs.view(-1), batch_y.view(-1).float()) + total_loss.append(loss.item()) + + # 收集预测和真实标签 + probs = torch.sigmoid(outputs) # [batch_size, seq_len] + preds.append(probs.cpu().view(-1)) # 展平为1D + trues.append(batch_y.cpu().view(-1)) # 展平为1D + + total_loss = np.average(total_loss) + + # 计算指标 + preds = torch.cat(preds, 0).numpy() + trues = torch.cat(trues, 0).numpy() + + # 使用0.5作为阈值 + predictions = (preds > 0.5).astype(int) + accuracy = accuracy_score(trues, predictions) + + self.model.train() + self.classification_head.train() + + return total_loss, accuracy, preds, trues + + def train(self, setting): + train_data, train_loader = self._get_data(flag='train') + vali_data, vali_loader = self._get_data(flag='val') + test_data, test_loader = self._get_data(flag='test') + + path = os.path.join(self.args.checkpoints, setting) + if not os.path.exists(path): + os.makedirs(path) + + time_now = time.time() + train_steps = len(train_loader) + early_stopping = EarlyStopping(patience=self.args.patience, verbose=True) + + model_optim = self._select_optimizer() + criterion = self._select_criterion() + + logger.info(f"🚀 开始监督异常检测训练,设置: {setting}") + logger.info(f"📊 数据统计 - 训练集: {len(train_data)}, 验证集: {len(vali_data)}, 测试集: {len(test_data)}") + + for epoch in range(self.args.train_epochs): + iter_count = 0 + train_loss = [] + + self.model.train() + self.classification_head.train() + epoch_time = time.time() + + # 使用tqdm显示进度 + pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{self.args.train_epochs}') + + for i, (batch_x, batch_y) in enumerate(pbar): + iter_count += 1 + model_optim.zero_grad() + + batch_x = batch_x.float().to(self.device) + batch_y = batch_y.long().to(self.device) # [batch_size, seq_len] + + # 提取TimesNet特征 + features = self._extract_features(batch_x) # [batch_size, seq_len, features] + + # 通过分类头得到每个时间步的预测 + outputs = self.classification_head(features) # [batch_size, seq_len, 1] + outputs = outputs.squeeze(-1) # [batch_size, seq_len] + + # 计算损失 - 逐点损失 + loss = criterion(outputs.view(-1), batch_y.view(-1).float()) + train_loss.append(loss.item()) + + # 更新进度条 + pbar.set_postfix({'loss': f'{loss.item():.4f}'}) + + if (i + 1) % 100 == 0: + logger.info(f"\titers: {i+1}, epoch: {epoch+1} | loss: {loss.item():.7f}") + speed = (time.time() - time_now) / iter_count + left_time = speed * 
((self.args.train_epochs - epoch) * train_steps - i) + logger.info(f'\tspeed: {speed:.4f}s/iter; left time: {left_time:.4f}s') + iter_count = 0 + time_now = time.time() + + loss.backward() + # 梯度裁剪 + nn.utils.clip_grad_norm_(list(self.model.parameters()) + list(self.classification_head.parameters()), max_norm=1.0) + model_optim.step() + + logger.info(f"Epoch: {epoch+1} cost time: {time.time() - epoch_time:.2f}s") + + train_loss = np.average(train_loss) + vali_loss, val_accuracy, val_preds, val_trues = self.vali(vali_data, vali_loader, criterion) + test_loss, test_accuracy, test_preds, test_trues = self.vali(test_data, test_loader, criterion) + + # 计算详细指标 + val_predictions = (val_preds > 0.5).astype(int) + val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_trues, val_predictions, average='binary') + + logger.info(f"Epoch: {epoch+1}, Steps: {train_steps}") + logger.info(f"Train Loss: {train_loss:.7f}") + logger.info(f"Val Loss: {vali_loss:.7f}, Acc: {val_accuracy:.4f}, P: {val_precision:.4f}, R: {val_recall:.4f}, F1: {val_f1:.4f}") + logger.info(f"Test Loss: {test_loss:.7f}, Acc: {test_accuracy:.4f}") + + # 使用F1分数进行早停 + early_stopping(-val_f1, self.model, path) + if early_stopping.early_stop: + logger.info("Early stopping") + break + + adjust_learning_rate(model_optim, epoch + 1, self.args) + + # 加载最佳模型 + best_model_path = path + '/' + 'checkpoint.pth' + self.model.load_state_dict(torch.load(best_model_path)) + + return self.model + + def test(self, setting, test=0): + test_data, test_loader = self._get_data(flag='test') + + if test: + logger.info('Loading model checkpoint...') + checkpoint_path = os.path.join('./checkpoints/' + setting, 'checkpoint.pth') + self.model.load_state_dict(torch.load(checkpoint_path)) + + folder_path = './test_results/' + setting + '/' + if not os.path.exists(folder_path): + os.makedirs(folder_path) + + logger.info("🧪 开始监督异常检测测试...") + + self.model.eval() + self.classification_head.eval() + + preds = [] + trues = [] + + with torch.no_grad(): + for i, (batch_x, batch_y) in enumerate(test_loader): + batch_x = batch_x.float().to(self.device) + batch_y = batch_y.long().to(self.device) # [batch_size, seq_len] + + # 提取TimesNet特征 + features = self._extract_features(batch_x) # [batch_size, seq_len, features] + + # 通过分类头得到每个时间步的预测 + outputs = self.classification_head(features) # [batch_size, seq_len, 1] + outputs = outputs.squeeze(-1) # [batch_size, seq_len] + + # 计算概率 + probs = torch.sigmoid(outputs) # [batch_size, seq_len] + + preds.append(probs.cpu().numpy().flatten()) # 展平为1D + trues.append(batch_y.cpu().numpy().flatten()) # 展平为1D + + preds = np.concatenate(preds, axis=0) + trues = np.concatenate(trues, axis=0) + + logger.info(f'Test shape: preds={preds.shape}, trues={trues.shape}') + + # 评估不同阈值 + thresholds = [0.3, 0.4, 0.5, 0.6, 0.7] + best_f1 = 0 + best_threshold = 0.5 + best_results = {} + + for threshold in thresholds: + predictions = (preds > threshold).astype(int) + + accuracy = accuracy_score(trues, predictions) + precision, recall, f_score, _ = precision_recall_fscore_support(trues, predictions, average='binary') + + logger.info(f"阈值 {threshold:.1f}: Acc={accuracy:.4f}, P={precision:.4f}, R={recall:.4f}, F1={f_score:.4f}") + + if f_score > best_f1: + best_f1 = f_score + best_threshold = threshold + best_results = { + 'threshold': threshold, + 'accuracy': accuracy, + 'precision': precision, + 'recall': recall, + 'f1': f_score + } + + # 使用最佳阈值的预测结果 + best_predictions = (preds > best_threshold).astype(int) + + # 计算PRAUC + precision_curve, 
recall_curve, _ = precision_recall_curve(trues, preds) + prauc = auc(recall_curve, precision_curve) + + # 计算混淆矩阵 + cm = confusion_matrix(trues, best_predictions) + + logger.info(f"\n🎯 最佳结果 (阈值={best_threshold:.1f}):") + logger.info(f"Accuracy: {best_results['accuracy']:.4f}") + logger.info(f"Precision: {best_results['precision']:.4f}") + logger.info(f"Recall: {best_results['recall']:.4f}") + logger.info(f"F1-Score: {best_results['f1']:.4f}") + logger.info(f"PRAUC: {prauc:.4f}") + logger.info(f"\n混淆矩阵:\n{cm}") + + # 分类报告 + report = classification_report(trues, best_predictions, target_names=['Normal', 'Anomaly']) + logger.info(f"\n分类报告:\n{report}") + + # 保存结果 + result_file = f'./results/{setting}/' + if not os.path.exists(result_file): + os.makedirs(result_file) + + result_path = os.path.join(result_file, 'result_supervised_anomaly_detection.txt') + with open(result_path, 'w') as f: + f.write(f"Setting: {setting}\n") + f.write(f"Best Threshold: {best_threshold:.1f}\n") + f.write(f"Accuracy: {best_results['accuracy']:.4f}\n") + f.write(f"Precision: {best_results['precision']:.4f}\n") + f.write(f"Recall: {best_results['recall']:.4f}\n") + f.write(f"F1-Score: {best_results['f1']:.4f}\n") + f.write(f"PRAUC: {prauc:.4f}\n") + f.write(f"\nConfusion Matrix:\n{cm}\n") + f.write(f"\nClassification Report:\n{report}\n") + + # 同时保存到全局结果文件 + global_result_file = "result_supervised_anomaly_detection.txt" + with open(global_result_file, 'a') as f: + f.write(f"\n{'='*50}\n") + f.write(f"Setting: {setting}\n") + f.write(f"Accuracy: {best_results['accuracy']:.4f}, ") + f.write(f"Precision: {best_results['precision']:.4f}, ") + f.write(f"Recall: {best_results['recall']:.4f}, ") + f.write(f"F1-Score: {best_results['f1']:.4f}, ") + f.write(f"PRAUC: {prauc:.4f}\n") + f.write(f"Best Threshold: {best_threshold:.1f}\n") + + logger.info(f"📄 结果已保存到: {result_path}") + + return best_results \ No newline at end of file diff --git a/experiments/intermediate_results/correlation_matrix.parquet b/experiments/intermediate_results/correlation_matrix.parquet new file mode 100644 index 000000000..4ce304085 Binary files /dev/null and b/experiments/intermediate_results/correlation_matrix.parquet differ diff --git a/experiments/intermediate_results/features_to_remove.txt b/experiments/intermediate_results/features_to_remove.txt new file mode 100644 index 000000000..eeaa0f041 --- /dev/null +++ b/experiments/intermediate_results/features_to_remove.txt @@ -0,0 +1,10 @@ +aActivePower_L3 +aCurrentL1 +aCurrentL2 +aCurrentL3 +aReactivePower_L1 +aReactivePower_L2 +aReactivePower_L3 +aVoltage_L2_N +aVoltage_L3_N +airTotal diff --git a/requirements.txt b/requirements.txt index 9a20c2472..8cfef8bac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ sktime==0.16.1 sympy==1.11.1 torch==1.7.1 tqdm==4.64.1 -PyWavelets \ No newline at end of file +PyWavelets +pyarrow \ No newline at end of file diff --git a/result_supervised_anomaly_detection.txt b/result_supervised_anomaly_detection.txt new file mode 100644 index 000000000..27fab67b4 --- /dev/null +++ b/result_supervised_anomaly_detection.txt @@ -0,0 +1,10 @@ + +================================================== +Setting: supervised_anomaly_detection_SupervisedContact_TimesNet_SupervisedContact_ftM_sl40_ll48_pl0_dm64_nh8_el2_dl1_df64_expand2_dc4_fc1_ebtimeF_dtTrue_TimesNet_Supervised_Contact_Anomaly_Detection_0 +Accuracy: 0.8722, Precision: 0.1673, Recall: 0.3726, F1-Score: 0.2309, PRAUC: 0.1837 +Best Threshold: 0.7 + +================================================== 
+Setting: supervised_anomaly_detection_SupervisedContact_TimesNet_SupervisedContact_ftM_sl40_ll48_pl0_dm128_nh8_el2_dl1_df128_expand2_dc4_fc1_ebtimeF_dtTrue_TimesNet_Supervised_Contact_Anomaly_Detection_0
+Accuracy: 0.8565, Precision: 0.1186, Recall: 0.2780, F1-Score: 0.1663, PRAUC: 0.0852
+Best Threshold: 0.7
diff --git a/run.py b/run.py
index 793a53ac2..9db3d6bea 100644
--- a/run.py
+++ b/run.py
@@ -7,6 +7,7 @@
 from exp.exp_short_term_forecasting import Exp_Short_Term_Forecast
 from exp.exp_anomaly_detection import Exp_Anomaly_Detection
 from exp.exp_classification import Exp_Classification
+from exp.exp_supervised_anomaly_detection import Exp_Supervised_Anomaly_Detection
 from utils.print_args import print_args
 import random
 import numpy as np
@@ -21,7 +22,7 @@
     # basic config
     parser.add_argument('--task_name', type=str, required=True, default='long_term_forecast',
-                        help='task name, options:[long_term_forecast, short_term_forecast, imputation, classification, anomaly_detection]')
+                        help='task name, options:[long_term_forecast, short_term_forecast, imputation, classification, anomaly_detection, supervised_anomaly_detection]')
     parser.add_argument('--is_training', type=int, required=True, default=1, help='status')
     parser.add_argument('--model_id', type=str, required=True, default='test', help='model id')
     parser.add_argument('--model', type=str, required=True, default='Autoformer',
@@ -170,6 +171,8 @@
         Exp = Exp_Anomaly_Detection
     elif args.task_name == 'classification':
         Exp = Exp_Classification
+    elif args.task_name == 'supervised_anomaly_detection':
+        Exp = Exp_Supervised_Anomaly_Detection
     else:
         Exp = Exp_Long_Term_Forecast
diff --git a/scripts/anomaly_detection/Contact/TimesNet.sh b/scripts/anomaly_detection/Contact/TimesNet.sh
new file mode 100755
index 000000000..a7b676c41
--- /dev/null
+++ b/scripts/anomaly_detection/Contact/TimesNet.sh
@@ -0,0 +1,21 @@
+export CUDA_VISIBLE_DEVICES=0
+
+python -u run.py \
+  --task_name anomaly_detection \
+  --is_training 1 \
+  --root_path ./dataset \
+  --model_id Contact \
+  --model TimesNet \
+  --data Contact \
+  --features M \
+  --seq_len 40 \
+  --pred_len 0 \
+  --d_model 64 \
+  --d_ff 64 \
+  --e_layers 2 \
+  --enc_in 17 \
+  --c_out 17 \
+  --top_k 3 \
+  --anomaly_ratio 5.2 \
+  --batch_size 64 \
+  --train_epochs 20
\ No newline at end of file
diff --git a/scripts/anomaly_detection/Contact/TimesNet_Supervised.sh b/scripts/anomaly_detection/Contact/TimesNet_Supervised.sh
new file mode 100755
index 000000000..951afba36
--- /dev/null
+++ b/scripts/anomaly_detection/Contact/TimesNet_Supervised.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+export CUDA_VISIBLE_DEVICES=0
+
+echo "🚀 启动基于TimesNet的监督异常检测训练..."
+echo "📊 数据: 能源Contact数据集 (标注版本)" +echo "🏗️ 模型: TimesNet + 逐点分类头" +echo "🎯 任务: 逐点异常检测 (每分钟数据点分类)" + + + +python3 -u run.py \ + --task_name supervised_anomaly_detection \ + --is_training 1 \ + --root_path ./dataset \ + --model_id SupervisedContact \ + --model TimesNet \ + --data SupervisedContact \ + --features M \ + --seq_len 40 \ + --pred_len 0 \ + --d_model 64 \ + --d_ff 64 \ + --e_layers 2 \ + --enc_in 27 \ + --c_out 1 \ + --top_k 3 \ + --batch_size 32 \ + --learning_rate 0.001 \ + --train_epochs 3 \ + --patience 10 \ + --num_workers 4 \ + --des 'TimesNet_Supervised_Contact_Anomaly_Detection' \ + --itr 1 \ No newline at end of file diff --git a/scripts/anomaly_detection/PSM/TimesNet.sh b/scripts/anomaly_detection/PSM/TimesNet.sh index e972e61e8..2dd9b7bfd 100644 --- a/scripts/anomaly_detection/PSM/TimesNet.sh +++ b/scripts/anomaly_detection/PSM/TimesNet.sh @@ -1,4 +1,4 @@ -export CUDA_VISIBLE_DEVICES=6 +export CUDA_VISIBLE_DEVICES=0 python -u run.py \ --task_name anomaly_detection \ diff --git a/utils/logger.py b/utils/logger.py new file mode 100644 index 000000000..9e100589e --- /dev/null +++ b/utils/logger.py @@ -0,0 +1,56 @@ +import logging +import os +from datetime import datetime + +def setup_logger(script_function=None): + """ + 设置集中式日志配置 + + Args: + script_function: 脚本函数名,用于生成日志文件名 + + Returns: + logger: 配置好的logger实例 + """ + # 创建logs目录 + log_dir = "experiments/logs" + os.makedirs(log_dir, exist_ok=True) + + # 生成日志文件名 + if script_function is None: + script_function = "main" + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + log_filename = f"{script_function}_{timestamp}.log" + log_filepath = os.path.join(log_dir, log_filename) + + # 创建logger + logger = logging.getLogger(script_function) + logger.setLevel(logging.INFO) + + # 清除现有的handlers(避免重复) + logger.handlers.clear() + + # 设置日志格式 + formatter = logging.Formatter( + '[%(asctime)s] [%(levelname)s] [%(name)s] - %(message)s' + ) + + # 添加StreamHandler(输出到终端) + stream_handler = logging.StreamHandler() + stream_handler.setLevel(logging.INFO) + stream_handler.setFormatter(formatter) + logger.addHandler(stream_handler) + + # 添加FileHandler(输出到文件) + file_handler = logging.FileHandler(log_filepath, encoding='utf-8') + file_handler.setLevel(logging.INFO) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + + logger.info(f"Logger initialized. Log file: {log_filepath}") + + return logger + +# 创建默认logger实例 +logger = setup_logger("default") \ No newline at end of file diff --git a/utils/tools.py b/utils/tools.py index 03ef974a9..6efa7129a 100644 --- a/utils/tools.py +++ b/utils/tools.py @@ -36,7 +36,7 @@ def __init__(self, patience=7, verbose=False, delta=0): self.counter = 0 self.best_score = None self.early_stop = False - self.val_loss_min = np.Inf + self.val_loss_min = np.inf self.delta = delta def __call__(self, val_loss, model, path): diff --git a/utils_own/preview_csv.py b/utils_own/preview_csv.py new file mode 100644 index 000000000..6d8fa1be7 --- /dev/null +++ b/utils_own/preview_csv.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import pandas as pd + +def preview_csv_file(file_path, n_rows=5): + """ + Preview a CSV file by showing first and last n rows + + Args: + file_path (str): Path to the CSV file + n_rows (int): Number of rows to display from beginning and end + """ + try: + print(f"Loading CSV file: {file_path}") + df = pd.read_csv(file_path) + + print(f"File loaded successfully. 
Total rows: {len(df)}, columns: {len(df.columns)}") + print(f"Column names: {list(df.columns)}") + print(f"Data types:\n{df.dtypes}") + + print(f"\n--- First {n_rows} rows ---") + print(df.head(n_rows)) + + print(f"\n--- Last {n_rows} rows ---") + print(df.tail(n_rows)) + + + + except Exception as e: + print(f"Error previewing file {file_path}: {str(e)}") + raise + +if __name__ == "__main__": + # Hardcoded file path + file_path = "dataset/PSM/train.csv" + preview_csv_file(file_path, 5) \ No newline at end of file diff --git a/utils_own/preview_parquet.py b/utils_own/preview_parquet.py new file mode 100644 index 000000000..fe66d6fe4 --- /dev/null +++ b/utils_own/preview_parquet.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import pandas as pd + +def preview_parquet_file(file_path, n_rows=5): + """ + Preview a Parquet file by showing first and last n rows + + Args: + file_path (str): Path to the Parquet file + n_rows (int): Number of rows to display from beginning and end + """ + try: + print(f"Loading Parquet file: {file_path}") + df = pd.read_parquet(file_path) + + print(f"File loaded successfully. Total rows: {len(df)}, columns: {len(df.columns)}") + print(f"Column names: {list(df.columns)}") + print(f"Data types:\n{df.dtypes}") + + print(f"\n--- First {n_rows} rows ---") + print(df.head(n_rows)) + + print(f"\n--- Last {n_rows} rows ---") + print(df.tail(n_rows)) + + except Exception as e: + print(f"Error previewing file {file_path}: {str(e)}") + raise + +if __name__ == "__main__": + # Hardcoded file path + file_path = "dataset/supervised/lable/train_labeled.parquet" + preview_parquet_file(file_path, 5) \ No newline at end of file
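
For reference, a minimal standalone sketch of the PRAUC metric that this patch adds to `exp/exp_anomaly_detection.py` and also reports in `exp/exp_supervised_anomaly_detection.py`: `precision_recall_curve` over continuous anomaly scores, followed by `auc`. The `labels` and `scores` arrays below are synthetic stand-ins (assumptions for illustration only) for the ground-truth labels `gt` and the continuous scores `test_energy` / `preds` used in the experiments.

```python
# Minimal PRAUC sketch; `labels` and `scores` are synthetic stand-ins for
# the experiments' ground-truth labels and continuous anomaly scores.
import numpy as np
from sklearn.metrics import precision_recall_curve, auc

rng = np.random.default_rng(2021)
labels = (rng.random(1000) < 0.05).astype(int)   # ~5% anomalies: 0 = normal, 1 = anomaly
scores = rng.random(1000) + 0.5 * labels         # anomalous points get higher scores

precision_curve, recall_curve, _ = precision_recall_curve(labels, scores)
prauc = auc(recall_curve, precision_curve)       # area under the precision-recall curve
print(f"PRAUC : {prauc:.4f}")
```

Unlike accuracy or the F-score at a single (or best swept) threshold, PRAUC summarizes precision and recall across all thresholds, which is why the patch logs it alongside the thresholded metrics.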