diff --git a/LR.m b/LR.m
new file mode 100644
index 0000000..63f12c6
--- /dev/null
+++ b/LR.m
@@ -0,0 +1,57 @@
+clear all;
+% Normalize the data set: divide each feature column by its maximum
+data=importdata('breast.mat');
+[di,dj]=size(data);
+for i=1:dj-1
+    data(:,i+1)=data(:,i+1)./max(data(:,i+1));
+end
+% Select samples: hold out the last 200 rows as the test set
+for i=(di-199):di
+    test(i-di+200,:)=data(i,:);
+end
+for i=1:dj-1
+    Test(:,i)=test(:,i+1);        % test features without the label column
+end
+learn_num=30;                     % base training-set size; runs use 30, 60, 90, 120 samples
+for num=1:4
+    number=learn_num*num;
+for Ran=1:100                     % repeat each training-set size with 100 random draws
+y=rand(1, di-200);
+[ignore,p] = sort(y);             % random permutation of the non-test rows
+for i=1:number
+    x_0(i,:)=data(p(i),:);
+end
+for i=1:dj-1
+    x(:,i)=x_0(:,i+1);            % training features without the label column
+end
+% Training: batch gradient ascent on the log-likelihood with step size 1/k
+k=1;
+w=1.+zeros(dj-1,1);               % initialize all weights to 1
+w0=0;
+while(k<=50)
+    p=exp(w0+x*w);                % note: this reuses the name p from the permutation above
+    P=p./(1+p);                   % predicted probability of class 1
+    for i=1:dj-1
+        w(i)=w(i)+k^-1*sum(x(:,i).*(x_0(:,1)-P));
+    end
+    w0=w0+k^-1*sum(x_0(:,1)-P);
+    k=k+1;
+end
+% Testing: threshold the predicted probability at 0.5
+error=0;
+for i=1:200
+    p_test=exp(Test(i,:)*w+w0);
+    if(p_test/(1+p_test)<0.5&test(i,1)==1)
+        error=error+1;
+    end
+    if(p_test/(1+p_test)>=0.5&test(i,1)==0)
+        error=error+1;
+    end
+end
+lev(Ran)=(200-error)/200;         % test accuracy for this draw
+end
+w'
+level(num)=mean(lev);             % mean accuracy over the 100 draws
+clear x x_0 w w0;
+end
+level
\ No newline at end of file
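The inner `while` loop in LR.m above is batch gradient ascent on the logistic log-likelihood with a 1/k step size, written one feature at a time. A minimal vectorized sketch of the same update follows; `X` and `t` are hypothetical stand-ins for LR.m's feature matrix `x` and its 0/1 label column `x_0(:,1)`, with made-up sizes so the snippet runs on its own.

```matlab
% Sketch only: a vectorized form of the batch update in LR.m.
n = 120; d = 9;                         % made-up sizes for the demo
X = rand(n, d);                         % placeholder features in [0,1]
t = double(rand(n, 1) > 0.5);           % placeholder labels coded 0/1
w  = ones(d, 1);                        % LR.m also starts from all-ones weights
w0 = 0;                                 % bias term
for k = 1:50
    P  = 1 ./ (1 + exp(-(X*w + w0)));   % predicted P(class = 1); same as exp(z)./(1+exp(z))
    g  = t - P;                         % gradient of the log-likelihood w.r.t. the logit
    w  = w  + (1/k) * (X' * g);         % replaces the per-feature loop sum(x(:,i).*(x_0(:,1)-P))
    w0 = w0 + (1/k) * sum(g);
end
```

The per-feature loop in LR.m computes the same quantity as `X' * g`, one component at a time.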
diff --git a/README.md b/README.md
deleted file mode 100644
index 2ea9965..0000000
--- a/README.md
+++ /dev/null
@@ -1,148 +0,0 @@
-# SES2020spring
-
-## Introduction
-
-This repository contains the complementary material for Project Practice (项目实践) and Fundamentals of Machine Learning (机器学习基础) in 2020 spring. Due to the distributed and asynchronous nature of this study format, we will steer the course through a series of self-contained units around programming, visualization, and simulation, which may or may not match your interests for your future endeavors. This course tries to fulfill this arrangement by introducing some tools and demonstrating their capacity.
-
-4 units are planned for the first 4 weeks. Each unit will contain at least 2 parts. The first part will discuss general practice and methodology around software engineering, and this serves as the general topic across the board. The second part will introduce some tools or libraries and show how to use them. It is highly recommended that you research the tool or library before class and practice with it afterwards. The classroom time will be question/answer time, and I will stay online to answer your questions through GitHub issues, QQ, and WeChat.
-
-## Unit 1: Introduction to Software Engineering and Git
-
-In the introduction to software engineering, we will cover the concepts of software, software engineering, and the software crisis. We will also show the basic workflow of Git, the distributed version control system.
-
-## Unit 2: Agile Programming
-
-We will have 2 parts in this unit. In the first part, we go through an agile method called Scrum. In the second part, we will talk about Python.
-
-## Unit 3: Object-Oriented Design and Programming
-
-## Unit 4: Unit Testing
-
-## Final Project
-
-Please form a team of 2~3 members and pick a project from http://www.cse.msu.edu/~cse231/PracticeOfComputingUsingPython/.
-
-## Q&A
-
-Q: Do we have a syllabus for this course? What's the goal of this course?
-
-A: Because of the situation caused by the coronavirus, we do have an alternative plan for the first 4 weeks of the course. The topics are listed above. The goal of this course is to introduce several concepts and practices around software development.
-
-Q: Could the videos be more connected? The video titles are not clear.
-
-A: We want each video to be as self-contained and independent as possible, so that referencing these videos in the future makes sense without referring to the context and background. Acfun.cn and bilibili.com are meant to be used as video hosts only. Titles and descriptions for the videos are only available on the site where this README resides. This keeps the course material neutral to the video hosting site.
-
-Q: Some parts have no complementary slides?
-
-A: The programming parts are mainly screencasts.
-
-Q: Would teaming up be a better choice?
-
-A: The exercises of some units are individual, while other exercises require teamwork.
-
-Q: What about students without a computer at their disposal?
-
-A: We provide references and some reading instructions in each unit. These are for students without computers.
-
-## Fundamentals of Machine Learning (机器学习基础)
-
-### Introduction
-
-1. Machine learning and related terms.
-2. Types of problems discussed in machine learning.
-3. Introduction to neural networks.
-
-[video: introduction to machine learning @acfun](https://www.acfun.cn/v/ac15348189)
-
-[video: introduction to machine learning @bilibili](https://www.bilibili.com/video/BV1Wz4y1R7dh/)
-
-The material is in c1_intro.
-
-### Logistic Regression
-
-1. logistic regression
-2. stochastic gradient descent
-3. input normalization
-4. oversampling/undersampling
-
-[video for logistic regression, @acfun](https://www.acfun.cn/v/ac15394923)
-
-[video for logistic regression, @bilibili](https://www.bilibili.com/video/BV1Ta4y1i7E4/)
-
-[pdf for logistic regression](https://github.com/pipehappy1/SES2020spring/blob/master/C2_logistic_regression.pdf)
-
-### Exercise 1
-
-The task in this exercise is to apply logistic regression to the "Breast Cancer Wisconsin (Diagnostic)" data set.
-
-* You can set the train/test split to 0.7/0.3 or any value you like.
-* You can choose your favorite tool besides PyTorch.
-* Post your answer in a pull request to this repository.
-* Discuss with each other, but try to work the code out by yourself.
-
-There is a piece of code demonstrating logistic regression using PyTorch: mlex1/lr.py
-
-The data is available at https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)
-
-The data is also available on Kaggle: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data
-
-Note:
-
-* The UC Irvine Machine Learning Repository is the classic go-to site for finding machine learning datasets.
-* Kaggle is a popular machine learning competition site.
-
-### Artificial Neural Networks
-
-1. Ensemble models by bagging/stacking/boosting
-2. MLP by stacking logistic regressions
-3. Automatic differentiation, i.e. back propagation
-
-Videos:
-
-* [multilayer perceptron @acfun](https://www.acfun.cn/v/ac15453604)
-* [multilayer perceptron @bilibili](https://www.bilibili.com/video/BV125411s7sH/)
-* [back propagation @acfun](https://www.acfun.cn/v/ac15453964)
-* [back propagation @bilibili](https://www.bilibili.com/video/BV1Ai4y147Aj/)
-
-pdfs:
-
-* [Perceptron and multilayer perceptron](https://github.com/pipehappy1/SES2020spring/blob/master/c3_mlp/c3_mlp.pdf)
-* [Back propagation slides from Stanford cs231n](https://github.com/pipehappy1/SES2020spring/blob/master/c3_mlp/lecture_4.pdf)
-* [Back propagation slides from Stanford cs231n (source)](http://cs231n.stanford.edu/slides/2020/lecture_4.pdf)
-
-### Convolutional Networks
-
-1. Convolutional operator in convolutional neural networks (CNN)
-2. Other layers often used in CNNs
-3. Learning tricks
-
-Videos:
-
-* [CNN @acfun](https://www.acfun.cn/v/ac15496062)
-* [CNN @bilibili](https://www.bilibili.com/video/BV16K411W7ma)
-
-pdf:
-
-* [CNN](http://cs231n.stanford.edu/slides/2020/lecture_5.pdf)
-
-### Other Neural Network Models
-
-Videos:
-
-* [pretrained CNN @acfun](https://www.acfun.cn/v/ac15537219)
-* [pretrained CNN @bilibili](https://www.bilibili.com/video/BV1of4y117sk)
-
-pdf:
-
-* [pretrained CNN](http://cs231n.stanford.edu/slides/2020/lecture_9.pdf)
-
-### Exercise 2
-
-Please go through [deep learning with PyTorch: a 60 minute blitz](https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html) by yourself. Post any questions in the QQ chat group if you'd like to discuss.
-
-* Use [conda](https://docs.conda.io/en/latest/miniconda.html) to manage and install the packages.
-* Go to [PyTorch](https://pytorch.org/) to find the command to install PyTorch.
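The Convolutional Networks section of the README above lists the convolutional operator as its first topic. As a quick, self-contained illustration (a toy sketch, not code from this repository), the loop below slides a 3x3 kernel over a single-channel feature map and takes weighted sums, which is the cross-correlation form that CNN layers compute; all names and sizes are made up for the demo.

```matlab
% Toy sketch of the convolutional operator used in CNNs (all names made up).
img    = rand(8, 8);                     % a single-channel input feature map
kernel = [1 0 -1; 1 0 -1; 1 0 -1];       % a 3x3 vertical-edge filter
[kh, kw] = size(kernel);
out = zeros(size(img,1)-kh+1, size(img,2)-kw+1);   % "valid" output: stride 1, no padding
for r = 1:size(out, 1)
    for c = 1:size(out, 2)
        patch     = img(r:r+kh-1, c:c+kw-1);        % the window under the kernel
        out(r, c) = sum(sum(patch .* kernel));      % weighted sum over the window
    end
end
% The built-in conv2 gives the same result once the kernel is flipped,
% because CNN "convolution" is really cross-correlation:
% out2 = conv2(img, rot90(kernel, 2), 'valid');
```

Stacking many such filters, interleaved with the other layers listed in that section, is what builds up a CNN.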
diff --git a/breast_nb.m b/breast_nb.m
new file mode 100644
index 0000000..688e05c
--- /dev/null
+++ b/breast_nb.m
@@ -0,0 +1,102 @@
+clear all;
+% Load the data set (unlike LR.m, no per-feature normalization is applied here)
+data=importdata('breast.mat');
+[di,dj]=size(data);
+learn_num=30;                 % base training-set size; runs use 30, 60, 90, 120 samples
+for num=1:4
+    number=learn_num*num;
+for Ran=1:100                 % draw `number` training samples at random, 100 times
+y=rand(1, di-200);
+[ignore,p] = sort(y);         % random permutation of the non-test rows
+for i=1:number
+    x_0(i,:)=data(p(i),:);    % the `number` randomly chosen training samples
+end
+index_1=find(x_0(:,1)==1);
+len_1=length(index_1);        % indices and count of training samples in class 1
+index_2=find(x_0(:,1)==2);
+len_2=length(index_2);        % indices and count of training samples in class 2
+for i=1:len_1
+    x_1(i,:)=x_0(index_1(i),:);    % class-1 samples go into x_1
+end
+for i=1:len_2
+    x_2(i,:)=x_0(index_2(i),:);    % class-2 samples go into x_2
+end
+clear x_0 index_1 index_2;
+for i=(di-199):di
+    test(i-di+200,:)=data(i,:);    % hold out the last 200 rows as the test set
+end
+% Training: for each feature, count how often each distinct value occurs per class
+for i=2:dj
+    [x1,y1]=sort(x_1(:,i));
+    k=x1(1);
+    z_1(1,1)=x1(1);           % row 1 of z_1: distinct values; row 2: their counts
+    z_1(2,1)=1;
+    j=1;
+    for I=2:len_1
+        if(x1(I)==k)
+            z_1(2,j)=z_1(2,j)+1;
+        end
+        if(x1(I)~=k)
+            k=x1(I);
+            j=j+1;
+            z_1(1,j)=k;
+            z_1(2,j)=1;
+        end
+    end
+    [n,step_1]=size(z_1);
+    [x2,y2]=sort(x_2(:,i));
+    k=x2(1);
+    z_2(1,1)=x2(1);
+    z_2(2,1)=1;
+    j=1;
+    for I=2:len_2
+        if(x2(I)==k)
+            z_2(2,j)=z_2(2,j)+1;
+        end
+        if(x2(I)~=k)
+            k=x2(I);
+            j=j+1;
+            z_2(1,j)=k;
+            z_2(2,j)=1;
+        end
+    end
+    [n,step_2]=size(z_2);
+    % Per-feature likelihood of each test sample: use the count of the closest observed value
+    for t=1:200
+        for j=1:step_1
+            dis1(j)=abs(test(t,i)-z_1(1,j));
+        end
+        dis_1=find(dis1==min(dis1));
+        P_1(t,i-1)=z_1(2,dis_1(1))/len_1;
+        for j=1:step_2
+            dis2(j)=abs(test(t,i)-z_2(1,j));
+        end
+        dis_2=find(dis2==min(dis2));
+        P_2(t,i-1)=z_2(2,dis_2(1))/len_2;
+    end
+    clear x1 x2 y1 y2 z_1 z_2 dis1 dis2 dis_1 dis_2;
+end
+clear x_1 x_2;
+error=0;                      % testing: compare class scores, prior times product of per-feature likelihoods
+for s=1:200
+    P1=1;
+    P2=1;
+    for j=1:dj-1
+        P1=P1*P_1(s,j);       % product of per-feature likelihoods for class 1
+    end
+    P1=P1*len_1/number;       % times the class-1 prior
+    for j=1:dj-1
+        P2=P2*P_2(s,j);
+    end
+    P2=P2*len_2/number;       % times the class-2 prior
+    if(P1>P2&test(s,1)==2)
+        error=error+1;
+    end
+    if(P1