导航

    精算后花园

    • 注册
    • 登录
    • 搜索
    • 版块
    • 最新
    • 话题
    • 热门
    • 用户
    • 群组
    1. 主页
    2. 谢远涛
    谢
    • 资料
    • 关注
    • 粉丝
    • 主题
    • 帖子
    • 最佳
    • 群组

    谢远涛

    @谢远涛

    3
    声望
    11
    帖子
    6
    资料浏览
    1
    粉丝
    0
    关注
    注册时间 最后登录

    谢远涛 关注

    谢远涛 发布的最佳帖子

    • 再保险中高阶矩计算:对数正态分布与正态分布

      2.1 对数正态分布不完全k阶矩公式

      LN<-function(mu,sigma,L,U,k){
      Lk<-(log(L)-mu)/sigma-k*sigma
      Uk<-(log(U)-mu)/sigma-k*sigma
      exp(k*mu+k^2*sigma^2/2)*(pnorm(Uk)-pnorm(Lk))
      }
      

      verify the pdf

      LN(8.5,0.8,0,Inf,0)
      

      exam 2.1

      EX=LN(8.5,0.8,0,Inf,1)
      EX1=0.75*EX
      EX2=LN(8.5,0.8,0,25000,1)+25000*LN(8.5,0.8,25000,Inf,0)
      

      exam 2.2

      VARX=LN(8.5,0.8,0,Inf,2)-LN(8.5,0.8,0,Inf,1)^2
      VARX1=0.75^2*VARX
      EX2_SQ=LN(8.5,0.8,0,25000,2)+25000^2*LN(8.5,0.8,25000,Inf,0)
      VARX2=EX2_SQ-EX2^2
      VARX2
      

      2.2 正态分布不完全k阶矩公式

      2.2.0 正态分布不完全0阶矩公式:CDF

      pnorm(Inf)-pnorm(0)
      

      2.2.1 正态分布不完全1阶矩公式

      norm<-function(mu=0,sigma=1,L,U){
      L1<-(L-mu)/sigma
      U1<-(U-mu)/sigma
      mu*{pnorm(U1)-pnorm(L1)}-sigma*{dnorm(U1)-dnorm(L1)}
      }
      norm(,,0,1e10)
      norm(,,-Inf,Inf)
      

      2.2.2 正态分布不完全2阶矩公式

      norm2<-function(mu=0,sigma=1,L,U){
      L1<-(L-mu)/sigma
      U1<-(U-mu)/sigma
      (mu^2+sigma^2)*{pnorm(U1)-pnorm(L1)}-
      sigma*{(2*mu+sigma*U1)*dnorm(U1)-(2*mu+sigma*L1)*dnorm(L1)}
      }
      norm2(,,0,Inf)
      
      发布在 R
      谢
      谢远涛
    • 按照某个变量的值拆分数据集(为循环和批量分析服务)

      /*按照变量date的值来拆分数据集
      如date=19960131生成一个数据集,date=19960228生成一个
      现在有很多个,到20080731了
      */
      /*方法一
      思路是先把distinct date的值变为宏变量,然后用data步输出。
      反复读数据,会很慢
      */
      data a;
      do date=1991 to 2011;
      output;
      end;
      run;
      data null;
      set a end=last;
      call symputx('date'||left(n),date);
      if last then call symput ('n',n);
      run;
      %macro split;
      %do i=1 %to &n;
      data b&i;
      set a;
      where date=&&date&i;
      run;
      %end;
      %mend;
      %split;

      /方法二/
      data date;
      date=input("19960131",yymmdd8.);
      output;
      date=input("19960228",yymmdd8.);
      output;
      date=input("19960309",yymmdd8.);
      output;
      run;

      data null;
      set date;
      call execute ("data data_"||strip(put(date,yymmddn8.))||";set a;if date="||date||";run;");
      run;

      /方法三/

      data a;
      format target_day 8.;inputtargetday;
      cards;
      20120730
      20120731
      20120810
      20120817
      ;run;

      %macro segmentation();
      proc sort data = a out = work.t1(keep = target_day) nodupkey;
      by target_day;
      run;
      data null;
      set work.t1 nobs=n;
      call symputx(compress("set_"||n), put(input(target_day, yymmdd8.), yymmddn8.));
      if n = 1 then call symputx('n',n);
      run;
      data %do ii = 1 %to &n.;
      data_&&set_&ii..
      %end;;
      set a;
      select (target_day);
      %do ii = 1 %to &n.;
      when ("&&set_&ii..") output data_&&set_&ii..;
      %end;
      otherwise;
      end;
      run;
      proc sql;
      drop table work.t1;
      quit;
      %mend;
      %segmentation();

      发布在 SAS
      谢
      谢远涛
    • 罚广义线性模型:LASSO vs 岭回归

      3 罚广义线性模型 51

      install.packages("lars")
      install.packages("ridge")
      

      (1) OLS回归

      library(ridge)
      data(GenCont)
      model.ols <- lm(Phenotypes ~ ., data = as.data.frame(GenCont))
      summary(model.ols)
      

      提取回归系数

      coef.ols <- coef(model.ols)
      

      查看不等于0的回归系数

      coef.ols[coef.ols!=0]  
      

      (2) ridge regression(岭回归)

      model.rid <- linearRidge(Phenotypes ~ ., seq(0, 10, length = 100), data = as.data.frame(GenCont))
      

      查看结果

      summary(model.rid)
      

      提取系数

      coef返回的是标准化系数; 直接输入model.rid返回的是原始数据系数
      coef.rid <- coef(model.rid)
      
      model.rid2 <- linearRidge(Phenotypes ~ ., data = as.data.frame(GenCont))
      summary(model.rid2)
      coef.rid <- coef(model.rid2)
      

      查看不等于0的系数

      coef.rid[coef.rid!=0]                           
      

      (3) 做 lasso regression

      模型设定

      library(lars)
      dt<-as.matrix(GenCont)
      X<-dt[,-1]
      Y<-dt[,1]
      model.lasso1 <- lars(X, Y)    
      model.lasso2 <- lars(X, Y, type='lasso') 
      model.lasso3 <- lars(X, Y, type='for')  # Can use abbreviations
      

      看变量选入顺序

      model.lasso3
      summary(model.lasso3)
      

      Cp的含义:衡量多重共线性,其取值越小越好,这里取到第3步使得Cp值最小

      画个图

      plot(model.lasso3)
      set.seed(12345)
      

      做交叉验证

      CV.lasso <- cv.lars(X, Y, K=10)   
      
      (best <- CV.lassoindex[which.min(CV.lassocv)])
      

      选择最好的效果

      (coef.lasso <- coef.lars(model.lasso3, mode='fraction', s=best))
      

      命名

      names(coef.lasso) <- colnames(dt)[-1]
      

      查看结果

      coef.lasso[coef.lasso!=0]
      
      发布在 R
      谢
      谢远涛

    谢远涛 发布的最新帖子

    • 再保险中高阶矩计算:对数正态分布与正态分布

      2.1 对数正态分布不完全k阶矩公式

      LN<-function(mu,sigma,L,U,k){
      Lk<-(log(L)-mu)/sigma-k*sigma
      Uk<-(log(U)-mu)/sigma-k*sigma
      exp(k*mu+k^2*sigma^2/2)*(pnorm(Uk)-pnorm(Lk))
      }
      

      verify the pdf

      LN(8.5,0.8,0,Inf,0)
      

      exam 2.1

      EX=LN(8.5,0.8,0,Inf,1)
      EX1=0.75*EX
      EX2=LN(8.5,0.8,0,25000,1)+25000*LN(8.5,0.8,25000,Inf,0)
      

      exam 2.2

      VARX=LN(8.5,0.8,0,Inf,2)-LN(8.5,0.8,0,Inf,1)^2
      VARX1=0.75^2*VARX
      EX2_SQ=LN(8.5,0.8,0,25000,2)+25000^2*LN(8.5,0.8,25000,Inf,0)
      VARX2=EX2_SQ-EX2^2
      VARX2
      

      2.2 正态分布不完全k阶矩公式

      2.2.0 正态分布不完全0阶矩公式:CDF

      pnorm(Inf)-pnorm(0)
      

      2.2.1 正态分布不完全1阶矩公式

      norm<-function(mu=0,sigma=1,L,U){
      L1<-(L-mu)/sigma
      U1<-(U-mu)/sigma
      mu*{pnorm(U1)-pnorm(L1)}-sigma*{dnorm(U1)-dnorm(L1)}
      }
      norm(,,0,1e10)
      norm(,,-Inf,Inf)
      

      2.2.2 正态分布不完全2阶矩公式

      norm2<-function(mu=0,sigma=1,L,U){
      L1<-(L-mu)/sigma
      U1<-(U-mu)/sigma
      (mu^2+sigma^2)*{pnorm(U1)-pnorm(L1)}-
      sigma*{(2*mu+sigma*U1)*dnorm(U1)-(2*mu+sigma*L1)*dnorm(L1)}
      }
      norm2(,,0,Inf)
      
      发布在 R
      谢
      谢远涛
    • 按照某个变量的值拆分数据集(为循环和批量分析服务)

      /*按照变量date的值来拆分数据集
      如date=19960131生成一个数据集,date=19960228生成一个
      现在有很多个,到20080731了
      */
      /*方法一
      思路是先把distinct date的值变为宏变量,然后用data步输出。
      反复读数据,会很慢
      */
      data a;
      do date=1991 to 2011;
      output;
      end;
      run;
      data null;
      set a end=last;
      call symputx('date'||left(n),date);
      if last then call symput ('n',n);
      run;
      %macro split;
      %do i=1 %to &n;
      data b&i;
      set a;
      where date=&&date&i;
      run;
      %end;
      %mend;
      %split;

      /方法二/
      data date;
      date=input("19960131",yymmdd8.);
      output;
      date=input("19960228",yymmdd8.);
      output;
      date=input("19960309",yymmdd8.);
      output;
      run;

      data null;
      set date;
      call execute ("data data_"||strip(put(date,yymmddn8.))||";set a;if date="||date||";run;");
      run;

      /方法三/

      data a;
      format target_day 8.;inputtargetday;
      cards;
      20120730
      20120731
      20120810
      20120817
      ;run;

      %macro segmentation();
      proc sort data = a out = work.t1(keep = target_day) nodupkey;
      by target_day;
      run;
      data null;
      set work.t1 nobs=n;
      call symputx(compress("set_"||n), put(input(target_day, yymmdd8.), yymmddn8.));
      if n = 1 then call symputx('n',n);
      run;
      data %do ii = 1 %to &n.;
      data_&&set_&ii..
      %end;;
      set a;
      select (target_day);
      %do ii = 1 %to &n.;
      when ("&&set_&ii..") output data_&&set_&ii..;
      %end;
      otherwise;
      end;
      run;
      proc sql;
      drop table work.t1;
      quit;
      %mend;
      %segmentation();

      发布在 SAS
      谢
      谢远涛
    • SAS结果仿STATA输出&内生性问题
      ods rtf file="D:\_云同步\_论文\20171215税收对收入流动性\1_1家庭税收总额作为核心自变量.rtf";
      proc syslin data=family LIML;
      endogenous  incstr;
      instruments tax_level urban jobclass_hz labor;
      MN_1: model  MT= labor labor2  Incstr  asset  house  meaneduy  experience_hz  experience_hz2  tax_new;
      ODS OUTPUT ParameterEstimates =MN_1;
      RUN;
      
      proc syslin data=family LIML;
      endogenous  incstr;
      instruments tax_level urban jobclass_hz labor;
      MN_2: model MT= labor labor2  Incstr  asset  house  meaneduy  experience_hz  experience_hz2  tax_new  tax_new2;
      ODS OUTPUT ParameterEstimates =MN_2;
      RUN;
      
      proc syslin data=family LIML;
      endogenous  incstr;
      instruments tax_level urban jobclass_hz labor;
      MN_3: model MT=labor labor2  Incstr  asset  house  meaneduy  experience_hz  experience_hz2  tax_tod;
      ODS OUTPUT ParameterEstimates =MN_3;
      RUN;
      proc syslin data=family LIML;
      endogenous  incstr;
      instruments tax_level urban jobclass_hz labor;
      MN_4: model MT= labor labor2  Incstr  asset  house  meaneduy  experience_hz  experience_hz2  tax_tod  tax_tod2;
      ODS OUTPUT ParameterEstimates =MN_4;
      RUN;
      proc syslin data=family LIML;
      endogenous  incstr;
      instruments tax_level urban jobclass_hz labor;
      MN_5: model MT= labor labor2  Incstr  asset  house  meaneduy  experience_hz  experience_hz2  tax_old;
      ODS OUTPUT ParameterEstimates =MN_5;
      RUN;
      proc syslin data=family LIML;
      endogenous  incstr;
      instruments tax_level urban jobclass_hz labor;
      MN_6: model MT=labor labor2  Incstr  asset  house  meaneduy  experience_hz  experience_hz2  tax_old  tax_old2;
      ODS OUTPUT ParameterEstimates =MN_6;
      RUN;
      ods rtf close;
      
      /*Tax_new Tax_old 出现共线性 tax_old        = +1.0000 * tax_new +1.0000 * shock3                                                           */
      data MN_1;
      	rename Variable=Parameter;
      	length Variable  15;setMN1;attribparamlabel=′M1′length=20;
      	if Probt<=0.01 then param=strip(put(estimate,6.3))||' ***  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else if Probt<=0.05 then param=strip(put(estimate,6.3))||' **  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else if Probt<=0.1 then param=strip(put(estimate,6.3))||' *  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else param=strip(put(estimate,6.3))||'  ('||strip(put(Estimate/StdErr,6.2))||')';
      run;
      data MN_2;
      	rename Variable=Parameter;
      	length Variable  15;setMN2;attribparamlabel=′M2′length=20;
      	if Probt<=0.01 then param=strip(put(estimate,6.3))||' ***  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else if Probt<=0.05 then param=strip(put(estimate,6.3))||' **  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else if Probt<=0.1 then param=strip(put(estimate,6.3))||' *  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else param=strip(put(estimate,6.3))||'  ('||strip(put(Estimate/StdErr,6.2))||')';
      run;
      data MN_3;
      	rename Variable=Parameter;
      	length Variable  15;setMN3;attribparamlabel=′M3′length=20;
      	if Probt<=0.01 then param=strip(put(estimate,6.3))||' ***  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else if Probt<=0.05 then param=strip(put(estimate,6.3))||' **  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else if Probt<=0.1 then param=strip(put(estimate,6.3))||' *  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else param=strip(put(estimate,6.3))||'  ('||strip(put(Estimate/StdErr,6.2))||')';
      run;
      data MN_4;
      	rename Variable=Parameter;
      	length Variable  15;setMN4;attribparamlabel=′M4′length=20;
      	if Probt<=0.01 then param=strip(put(estimate,6.3))||' ***  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else if Probt<=0.05 then param=strip(put(estimate,6.3))||' **  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else if Probt<=0.1 then param=strip(put(estimate,6.3))||' *  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else param=strip(put(estimate,6.3))||'  ('||strip(put(Estimate/StdErr,6.2))||')';
      run;
      data MN_5;
      	rename Variable=Parameter;
      	length Variable  15;setMN5;attribparamlabel=′M5′length=20;
      	if Probt<=0.01 then param=strip(put(estimate,6.3))||' ***  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else if Probt<=0.05 then param=strip(put(estimate,6.3))||' **  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else if Probt<=0.1 then param=strip(put(estimate,6.3))||' *  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else param=strip(put(estimate,6.3))||'  ('||strip(put(Estimate/StdErr,6.2))||')';
      run;
      data MN_6;
      	rename Variable=Parameter;
      	length Variable  15;setMN6;attribparamlabel=′M6′length=20;
      	if Probt<=0.01 then param=strip(put(estimate,6.3))||' ***  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else if Probt<=0.05 then param=strip(put(estimate,6.3))||' **  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else if Probt<=0.1 then param=strip(put(estimate,6.3))||' *  ('||strip(put(Estimate/StdErr,6.2))||')';
      	else param=strip(put(estimate,6.3))||'  ('||strip(put(Estimate/StdErr,6.2))||')';
      run;
      
      PROC SQL;
      CREATE TABLE CUI.T1_ADJ510 AS
      SELECT coalesce(A.Parameter,B.Parameter,C.Parameter,D.Parameter,E.Parameter,F.Parameter) as Parameter,
      A.param AS MN_1,B.param AS MN_2,C.param AS MN_3,D.param AS MN_4,E.param AS MN_5,F.param AS MN_6
      FROM MN_1 AS A 
      FULL JOIN MN_2 AS B
      ON A.Parameter=B.Parameter
      FULL JOIN MN_3 AS C
      ON A.Parameter=C.Parameter OR B.Parameter=C.Parameter
      FULL JOIN MN_4 AS D
      ON A.Parameter=D.Parameter OR B.Parameter=D.Parameter OR C.Parameter=D.Parameter
      FULL JOIN MN_5 AS E
      ON A.Parameter=E.Parameter OR B.Parameter=E.Parameter OR C.Parameter=E.Parameter OR D.Parameter=E.Parameter
      FULL JOIN MN_6 AS F
      ON A.Parameter=F.Parameter OR B.Parameter=F.Parameter OR C.Parameter=F.Parameter OR D.Parameter=F.Parameter OR E.Parameter=F.Parameter
      ;
      QUIT;
      
      PROC EXPORT DATA= CUI.T1_ADJ510
                  OUTFILE= "D:\_云同步\_论文\20171215税收对收入流动性\T1_家庭税收总额作为核心自变量.csv" 
                  DBMS=CSV REPLACE;
           PUTNAMES=YES;
      RUN;
      
      #也可以封装为宏块结构:
      /*6 个*/
      %MACRO Merge_6(OUT=T19 , A=,B=,C=,D=,E=,F=);
      PROC SQL;
      CREATE TABLE &OUT AS
      SELECT coalesce(A.Parameter,B.Parameter,C.Parameter,D.Parameter,E.Parameter,F.Parameter) as Parameter,
      A.param AS &A,B.param AS &B,C.param AS &C,D.param AS &D,
      E.param AS &E,F.param AS &F
      FROM &A AS A 
      FULL JOIN &B AS B
      ON A.Parameter=B.Parameter
      FULL JOIN &C AS C
      ON A.Parameter=C.Parameter OR B.Parameter=C.Parameter
      FULL JOIN &D AS D
      ON A.Parameter=D.Parameter OR B.Parameter=D.Parameter OR C.Parameter=D.Parameter
      FULL JOIN &E AS E
      ON A.Parameter=E.Parameter OR B.Parameter=E.Parameter OR C.Parameter=E.Parameter OR D.Parameter=E.Parameter
      FULL JOIN &F AS F
      ON A.Parameter=F.Parameter OR B.Parameter=F.Parameter OR C.Parameter=F.Parameter OR D.Parameter=F.Parameter OR E.Parameter=F.Parameter
      ;
      quit;
      
      PROC EXPORT DATA= &OUT
                  OUTFILE= "D:\_云同步\_论文\20171215税收对收入流动性\&OUT..csv" 
                  DBMS=CSV REPLACE;
           PUTNAMES=YES;
      RUN;
      %MEND;
      
      发布在 SAS
      谢
      谢远涛
    • SAS IML交互式数据处理及小实例
      /*3 交互式矩阵语言 IML*/
      /*3.1  概述*/
      Proc IML;
      	Sc=15.25;
      	Vh={1 2};
      	Print Sc;
      	Print Vh;
      Quit;
      
      /*3.2  数据集操作语句*/
      Proc IML;
      filename out 'user.matrix'; 
      file out; 
          do i=1 to nrow(x); 
             do j=1 to ncol(x); 
                put (x[i,j]) 1.0 +2 @; 
             end; 
           	put;
      	end; 
         closefile out;
      QUIT;
      
      /*3.3  循环语句*/
      /*3.4  分支语句*/
      /*3.5  CALL语句*/
      /*3.5  实例*/
      
      /*实例1:方程组求解*/
      
      PROC IML;
      		A={2 3 1, 4 -1 2, 3 -1 1};
      		b={10, 13,8};
      		X=INV(A)*b;
      		PRINT A b X;
      QUIT;
      
      /*	实例2:非线性方程的迭代求解*/
      PROC IML;
        	maxiter=50; /* 定义最大迭代次数 */ 
          converge=.000001; /* 定义收敛精度*/ 
             /* 定义待求解函数 */ 
      start fun; 
            f= x**3-x+2; 
         finish fun; 
             /* 定义导函数 */ 
         start deriv; 
            df=3*x**2-1; 
         finish deriv; 
      start newton; 
            run fun;          
            do iter=1 to maxiter  /* 迭代直到最大迭代次数 */ 
            while(max(abs(f))>converge);            /* 收敛则提前退出 */ 
               run deriv;                /* 计算导函数 */ 
               delta=f/df;      /* 计算增量 */ 
               x=x-delta;                    /* 更新值 */ 
               run fun;                      /* 计算函数值 */ 
           end; 
         finish newton; 
      do; 
            print "Solving the equation:" ,; 
            x=-1;     /* 赋初值 */ 
            run newton; 
      	  iter=iter-1;
      	  print x f iter; 
      	  if iter>maxiter then print 'Not converge!';
         end;
      QUIT;
      
      /*实例3:把环比指数转化为定基指数*/
      data a;
      input id index;
      cards;
      1 100
      1 93
      1 103
      1 97
      2 100
      2 97
      2 116
      2 98
      run;
      proc iml;
      use a;
      read all into mat;
      index=j(8,1,1);
      index[1]=mat[1,2];
      do i=2 to 8 by 1;
         if (mat[i,1]=mat[i-1,1]) then index[i]=index[i-1]*mat[i,2];
         else index[i]=mat[i,2]; 
      end;
       print index;
      create b from index;
      append from index;
      close b;
      run;
      
      发布在 SAS
      谢
      谢远涛
    • SAS ODS输出导出系统
      /*4 ODS输出传递系统*/
      /*4.1  RTF输出*/
      ods rtf ;
      proc means data=xie.student;
      var math phys chem literat history english;
      run;
      ods rtf close;
      
      /*4.2  HTML输出*/
      ods html  body= 'odshtml_body.htm' ;
      ods html  file= 'odshtml_body.htm' 
      contents='odshtml_contents.htm'
      page='odshtml_page.htm'
      frame='odshtml_frame.htm';
      proc univariate data=xie.student;
      var math phys chem literat history english;
      title;
      run;
      ods html close;
      
      
      Ods html body='E:\SAS培训\bclass.html';
      Proc print data=xie.class label;
      Var  sex age height;
      Run;
      Ods html close;
      
      /*4.3  PDF输出*/
      Ods pdf file='E:\SAS培训\bclass.pdf';
      Proc print data=xie.class label;
      Var  sex age height;
      Run;
      Ods pdf close;
      
      /*4.4  数据文件输出*/
      proc univariate data=xie.student;
      var math phys chem literat history english;
      ods output 	Moments=Moments;
      run;
      
      /*4.5  图形输出*/
      
      
      data Sheets;
      input Distance @@;
      label Distance='Hole Distance in cm';
      datalines;
          9.80 10.20 10.27  9.70  9.76
         10.11 10.24 10.20 10.24  9.63
          9.99  9.78 10.10 10.21 10.00
          9.96  9.79 10.08  9.79 10.06
         10.10  9.95  9.84 10.11  9.93
         10.56 10.47  9.42 10.44 10.16
         10.11 10.36  9.94  9.77  9.36
          9.89  9.62 10.05  9.72  9.82
          9.99 10.16 10.58 10.70  9.54
         10.31 10.07 10.33  9.98 10.15
      ;
      run;
      title 'Normal Probability-Probability Plot for Hole Distance';
      ods graphics on;
      proc univariate data=Sheets noprint;
      ppplot Distance / normal(mu=10 sigma=0.3)
                              square;
      run;
      
      
      data Plates;
      label Gap = 'Plate Gap in cm';
      input Gap @@;
      datalines;
         0.746  0.357  0.376  0.327  0.485 1.741  0.241  0.777  0.768  0.409
         0.252  0.512  0.534  1.656  0.742 0.378  0.714  1.121  0.597  0.231
         0.541  0.805  0.682  0.418  0.506 0.501  0.247  0.922  0.880  0.344
         0.519  1.302  0.275  0.601  0.388 0.450  0.845  0.319  0.486  0.529
         1.547  0.690  0.676  0.314  0.736 0.643  0.483  0.352  0.636  1.080
         ;
      run;
      title 'Distribution of Plate Gaps';
      ods select ParameterEstimates GoodnessOfFit FitQuantiles MyHist;
      proc univariate data=Plates;
      var Gap;
      histogram / midpoints=0.2 to 1.8 by 0.2
                        lognormal
                        weibull
                        gamma
                        vaxis   = axis1
                        name    = 'MyHist';
      inset n mean(5.3) std='Std Dev'(5.3) skewness(5.3)
                   / pos = ne  header = 'Summary Statistics';
      axis1 label=(a=90 r=0);
      run;
      
      发布在 SAS
      谢
      谢远涛
    • 经济不平等分析:罗伦兹曲线与gini系数
      /*罗伦兹曲线与gini系数*/
      data taxf;
      	infile 'B:\SAS培训\taxf.txt';
      	input TIncome TTax RCode;
      	label TIncome='总收入';
      	label TTax='总纳税额';
      	label RCode='注册类型';
      run;
      data taxf;
      	set taxf;
      	if tincome =. then delete;
      	label rcode="code for register";
      run;
      proc gchart data=taxf;
      	vbar tincome;
      	pie rcode;
      run;
      proc gplot ;
      	plot ttax*tincome;
      run;
      
      data a;
      	SET taxf;
      proc sort;
      	by descending tincome;
      run;
      data a;
      	set a;
      	retain m 0;/*按照收入由高到底的顺序逐步汇总总收入*/
      	retain n 0;
      	 m=m+tincome; n=n+ttax;id=_n_;
      run;
      proc sort data=a;
      	BY descending id;
      	run;
      data b;
      	set a;
      	retain total_m total_n;
      	if _n_=1 then do total_m=m; total_n=n;end;
      		percent_i=m/total_m;
      		percent_t=n/total_n;
      		p=percent_i;
      	run;
      
      proc sort data=b;
      	by 	percent_i;
      run;
      proc gplot data=b;
      	symbol  i=join v=none line=1 width=2;/*color是数据线的颜色,i表示是否连接,v表示数据点用不用特殊符号标记,L表示线型,WIDTH表示数据线的厚度*/
      	plot (percent_i )* (percent_t p)/overlay;
      run;
      proc sql;
      select sum(percent_t-percent_i)/sum(percent_t)
      from b;
      quit;
      
      发布在 SAS
      谢
      谢远涛
    • SAS 宏
      /*2 宏Macro*/
      /*2.1 宏变量*/
      %let name1=name;
      %let name2=‘name’;
      proc print data=xie.class;
      where height=67;
      title “&name1 ‘s height is 67.”;
      title2 “&name2 ‘s height is 67.”;
      run;
      
      /*2.2  定义宏*/
      /*参数宏*/
      %macro try(proc,dsn);
      proc &proc data=&dsn;
      run;
      %mend try;
      %try(print,xie.class);
      
      /*无参数宏*/
      %macro plot;/*开始定义宏:PLOT*/
      proc plot;
      plot income*age;
      run;
      %mend plot; /*结束宏的定义*/
      data newdata; /*创建数据集*/
        input  name $ age income;
      cards;
      xie 26 3200
      wang 23 4400
      wu 33 3872
      zhou 19 5400
      meng 22 5580
      ; 
      run;
      %plot/*调用宏,绘制收入关于年龄的散点图*/
      proc print;run;
      data newdata2; /*筛选年龄大于20的记录,建立新数据集*/
      set newdata;
      if age>20;Run;
      %plot/*再次调用宏,绘制收入关于年龄的散点图*/
      proc print;
      run;
      
      /*2.3 宏函数*/
      /*2.4 宏表达式*/
      /*2.5 宏语句*/
      /*2.6 案例*/
      
      /* The Following macro function can find the first position of "exception" in "source" string  */
      %macro indexc(source, exception,delimiter=%str( ));
      /* the leading and tailling blanks will be delete by way of the following statement */
      %let exception =%trim(%left(&exception));
      %local count word result temp;
      %let count=1;
      %let temp=0;
      %let result=0;
      %let word=%qscan(&exception,&count,&delimiter);
      
      %do %while (&word ne);
      	%if (%index(&source,&word) ne 0) %then %do;
      	%let temp=%index(&source,&word);
      	%if (&temp < &result) or (&result = 0) %then %let result=&temp;%end;
      	%let count=%eval(&count+1);
      	%let word=%qscan(&exception,&count,&delimiter);
      %end;
      &result
      %mend indexc;
      
      %indexc(am, I am a teacher,delimiter=%str( ));
      %macro a;
      %local z;
      %let z=%indexc( xie18 - xie6, 1 2 3 4 5 6 7 8 9 0,delimiter=%str( ));
      %put &z;
      %mend a;
      %a;
      
      /*案例二*/
      /*  comma split  */
      %macro split(indata,outdata);
      data &outdata ;
      	set &indata;
      	length str $10;
      	str=trim(left(etiology)); 
      	%do i=1 %to 2;
      		len=length(str); 
      		if trim(left(str))='' then	len=0;
      sep_xie=index(str,","); 
      		if sep_xie=0  then do;
      			if &i=1 then do;
      				etiology_&i=str+0;
      				str='';end;
      			else etiology_&i=.;
      			end;
      if sep_xie=1 then do;
      			etiology_&i=.;str=left(substr(str,sep_xie+1)); 
      		end;
      		if sep_xie>1 then do; 
      		   etiology_&i=left(substr(str,1,sep_xie-1))+0;
      
      str=left(substr(str,sep_xie+1)); 
      		end;
      	%end;
      	rename str=etiology_code;
      	drop len sep_xie;
      run;
      %mend split;
      %split(xie.study,study);
      
      发布在 SAS
      谢
      谢远涛
    • SAS SQL语句
      /*1 SQL查询*/
      /*1.1 简介*/
      PROC SQL;
      CREATE TABLE CLASS AS
      SELECT height,weight,name  FROM XIE.CLASS
      WHERE SEX="男";
      QUIT;
      
      /*1.2 单表查询*/
      PROC SQL;
      CREATE TABLE A1 AS
      SELECT height,weight,name  
      FROM XIE.CLASS
      WHERE age>12
      GROUP BY sex
      ORDER BY height;
      
      SELECT name, height 
      FROM XIE.CLASS 
      WHERE name='托马斯';
      
      /*(一)选择列表SELECT*/
      /*1、选择所有列 */
      SELECT * 
      FROM xie.class;
      
      /*2、选择部分列并指定它们的显示次序 */
      SELECT name,height 
      FROM xie.class; 
      
      /*3、更改列标题 */
      SELECT name as 姓名, height as 身高
      FROM xie.class;
      
      /*4、删除重复行 */
      SELECT ALL SEX,AGE
      FROM xie.class;  
      SELECT DISTINCT SEX,AGE
      FROM xie.class; 
      
      /*5、选择不同的行 */
      SELECT DISTINCT age 
      FROM xie.class;
      
      /*
      data xie.class_score ;
      merge xie.class xie.student;
      if name="" then delete;
      drop sex age height weight;
      run;
      */
      
      proc sql noprint;
      select age, sex
      into :age, :sex
      from xie.class
      where name="简";
      
      %put &age &sex;
      
      /*(二)FROM子句*/
      PROC SQL;
      SELECT class_score.name,sex 
      FROM xie.class,xie.class_score 
      WHERE class.name=class_score.name; 
      
      
      SELECT a.name,sex 
      FROM xie.class AS a,xie.class_score AS b
      WHERE a.name=b.name; 
      
      SELECT a.name,sex 
      FROM xie.class  a,xie.class_score b
      WHERE a.name=b.name; 
      
      SELECT a.name,b.math,total.sum 
      FROM xie.class  a,xie.class_score b,
      (SELECT name,sum(math,phys,chem,literat,history,english) AS sum
      FROM xie.class_score ) AS total 
      WHERE a.name=b.name=total.name AND total.sum>360;
      
      /*(三)使用WHERE子句设置查询条件*/
      SELECT * 
      FROM xie.class 
      WHERE age>12; 
      
      SELECT * 
      FROM xie.class 
      WHERE age between 10 and 30; 
      
      SELECT * 
      FROM xie.class 
      WHERE age in(12,13,14); 
      
      SELECT * 
      FROM xie.class 
      WHERE name like "%%斯"; 
      
      
      SELECT * 
      FROM xie.class 
      WHERE age not is null; 
      
      /*(四)查询结果分组GROUP BY、排序ORDER BY*/
      SELECT * 
      FROM xie.class 
      ORDER BY age desc,name ASC ;
      
      
      SELECT * 
      FROM xie.class 
      GROUP BY sex ;
      
      /*(五)函数*/
      
      /*(六)其他语句*/
      /*1.CREATE(创建表, 索引, 视图, 同义词, 过程, 函数, 数据库链接)*/
      CREATE TABLE a1 AS
      SELECT *
      FROM xie.class;
      
      /*2.ALTER(改变表, 索引, 视图等)*/
      /*在表的后面增加一个字段*/
      ALTER TABLE a1 ADD average num;
      /*删除一个字段*/
      ALTER TABLE a1 drop average;
      
      
      DROP TABLE a1;
      
      CREATE TABLE a1 AS
      SELECT *
      FROM xie.class;
      DELETE FROM a1;
      
      INSERT INTO A1 VALUES ( 'ZHANG','男',12,66,120);
      
      UPDATE A1 
      SET SEX='女'
      WHERE NAME='ZHANG';
      
      
      /*1.3 跨表查询*/
      CREATE TABLE a2 AS
      SELECT a.name,a.sex,b.sum
      FROM xie.class a,
      (SELECT name,sum(math,phys,chem,literat,history,english) AS sum
      FROM xie.class_score ) AS b
      WHERE a.name=b.name ;
      
      /*1.4 嵌套查询*/
      
      /*1.5 案例*/
      proc sql;
      /*创建表*/
      create table student
      (Sno char(4)NOT NULL UNIQUE,/*列级完整性约束条件:取值唯一,不许缺失*/
      Sname CHAR(20)  UNIQUE,/*列级完整性约束条件:取值唯一*/
      Ssex char(1),
      Sdept char(3),
      Salary num,
      Birth num informat=date7.
      format=date7.,
      Hired num informat=date7.
      format=date7.);
      /*写入信息*/
         insert into student
       values('1639', '李四', 'F','TA1',42260,'26JUN70'd,'28JAN91'd)
       values('1065', '张立', 'M','ME3',38090,'26JAN54'd,'07JAN92'd)
       values('1400', '李勇', 'M','ME1',29769.'05NOV67'd,'16OCT90'd)
       values('1561' ,'刘晨', 'M',null,36514,'30NOV63'd,'07OCT87'd)
       values('1221', '王敏', 'F','FA3',.,'22SEP63'd,'04OCT94'd ) ;
      /*修改*/
      alter table student
      add  age num;
      /*创建查询*/
      title 'PROCLIB.PAYLIST Table';
      select *
      from student;
      /*删除表*/
      drop table student;
      quit;
      
      发布在 SAS
      谢
      谢远涛
    • 朴素贝叶斯

      4 朴素贝叶斯

      # install.packages("e1071")
      library(e1071)
      data(iris)
      dt<-iris
      head(dt)
      naiveBayes.model1<- naiveBayes(Species ~ Petal.Length + Petal.Width+Sepal.Length + Sepal.Width, dt)
      naiveBayes.model1
      

      预测结果

      predict <- predict(naiveBayes.model1,newdata=dt) # 训练集数据
      

      构建混淆矩阵

      table <- table(actual=dt$Species,predict=predict)
      

      计算误判概率

      (err <- paste0(round((sum(table)-sum(diag(table)))*100/sum(table),2),"%"))
      
      发布在 R
      谢
      谢远涛
    • 罚广义线性模型:LASSO vs 岭回归

      3 罚广义线性模型 51

      install.packages("lars")
      install.packages("ridge")
      

      (1) OLS回归

      library(ridge)
      data(GenCont)
      model.ols <- lm(Phenotypes ~ ., data = as.data.frame(GenCont))
      summary(model.ols)
      

      提取回归系数

      coef.ols <- coef(model.ols)
      

      查看不等于0的回归系数

      coef.ols[coef.ols!=0]  
      

      (2) ridge regression(岭回归)

      model.rid <- linearRidge(Phenotypes ~ ., seq(0, 10, length = 100), data = as.data.frame(GenCont))
      

      查看结果

      summary(model.rid)
      

      提取系数

      coef返回的是标准化系数; 直接输入model.rid返回的是原始数据系数
      coef.rid <- coef(model.rid)
      
      model.rid2 <- linearRidge(Phenotypes ~ ., data = as.data.frame(GenCont))
      summary(model.rid2)
      coef.rid <- coef(model.rid2)
      

      查看不等于0的系数

      coef.rid[coef.rid!=0]                           
      

      (3) 做 lasso regression

      模型设定

      library(lars)
      dt<-as.matrix(GenCont)
      X<-dt[,-1]
      Y<-dt[,1]
      model.lasso1 <- lars(X, Y)    
      model.lasso2 <- lars(X, Y, type='lasso') 
      model.lasso3 <- lars(X, Y, type='for')  # Can use abbreviations
      

      看变量选入顺序

      model.lasso3
      summary(model.lasso3)
      

      Cp的含义:衡量多重共线性,其取值越小越好,这里取到第3步使得Cp值最小

      画个图

      plot(model.lasso3)
      set.seed(12345)
      

      做交叉验证

      CV.lasso <- cv.lars(X, Y, K=10)   
      
      (best <- CV.lassoindex[which.min(CV.lassocv)])
      

      选择最好的效果

      (coef.lasso <- coef.lars(model.lasso3, mode='fraction', s=best))
      

      命名

      names(coef.lasso) <- colnames(dt)[-1]
      

      查看结果

      coef.lasso[coef.lasso!=0]
      
      发布在 R
      谢
      谢远涛