*Summary statistics;
*Created 05DEC2022;
* Save the output as PDF;
ODS PDF file="Summarystats_output.pdf";
*Create the library and read in the datasets;
* Assign the working directory to a library;
LIBNAME class "C:\Users\jpark8\OneDrive - Kent State
University\Documents\Econ62052\CLASSES\1202\";


PROC FORMAT CNTLIN=class.Hw8_ipumstimeuse_formats;
RUN; 

options fmtsearch=(class ); 


* Make a duplicate file with a shorter name (still in CLASS library though) and LOOK;
DATA compustat;
  SET class.Sas_ho_compustat_datafile;
run;
PROC CONTENTS order=varnum;
  title "Checking the compustat file first";
run;
*===================================================================
* A. create categorical variables;
*===================================================================;
*** 1. 1-digit NAICS code: NAICS is 6-digit character----------------;
** a. Create a value label (format);
PROC FORMAT;
  VALUE naics1_lbl
1 = "1. Agriculture, Forestry, Fishing and Hunting"
2 = "2. Mining, Extraction, Utilities"
3 = "3. Manufacturing"
4 = "4. Trade, Warehousing, Transportation"
5 = "5. Professional Services"
6 = "6. Education, Healthcare, and Social Services"
7 = "7. Recreation/Leisure Services"
8 = "8. Other Services"
9 = "9. Public Administration";
run;

** Make naics1 and assign the value label;
DATA compustat;
  SET compustat;
  naics1=INPUT(substr(naics,1,1), 1.);
  FORMAT naics1 naics1_lbl.;
  LABEL naics1 = "1-digit NAICS industry";
PROC FREQ;
  TABLE naics1;
  title "1-digit NAICS industry group";
run;
* b. Display the value label naics1_lbl;
PROC FORMAT FMTLIB LIB=Work.Formats; /* Here, you must indicate the "Work: library */
  SELECT Naics1_lbl;
  title "naics1_lbl format";
run;

* c. Tabulate the share of each naics1 in the entire sample.;
PROC FREQ;
  TABLE naics1;
  title "share of each naics1 in the entire sample";
run;
* d. Tabulate the share of each naics1 in year 2015.;
PROC FREQ;
  TABLE naics1;
  where year=2015;
  title "share of each naics1 in year 2015";
run;
*** 2. REGION----------------------------------------------------------;
** first make the format;
PROC FORMAT;
  VALUE region_lbl
  1 = "Northeast"
  2 = "Midwest"
  3 = "South"
  4 = "West";
run;
** Make the region variable and assign the format directly;
DATA compustat;
   SET compustat;
   IF state in ("CT","MA","ME","NH","NJ","NY","PA","RI","VT") THEN region=1;
   IF state in ("IA","IL","IN","KS","MI","MN","MO","ND","NE","OH","SD","WI") THEN region=2;
   IF state in ("AL","AR","DC","DE","FL","GA","KY","LA","MD","MS","NC","OK","SC","TN","TX","VA","WV")
     THEN region=3;
   IF state in ("AK","AZ","CA","CO","HI","ID","MT","NM","NV","OR","UT","WA","WY") THEN region=4;
  FORMAT region region_lbl.;
PROC FREQ;
  TABLE region;
  title "Geographic regions";
run;
title;

*B Produce the following summary statistics;
*1. What years are in the sample period;
title 'Sample period';
proc freq;
  table year;
run;
title;

title 'How many unique companies in the sample';
proc sql;
  select count(distinct(company)) as uniquecomp
  from compustat;
quit;
*20,010 unique names;
title;


PROC SQL;
  CREATE TABLE compustat AS
  SELECT *, count(distinct company) AS uniquecomp_conm
  FROM compustat
  GROUP BY conm;
quit;
PROC FREQ;
TABLE uniquecomp_conm;
title "Number of unique cleaned company names (company) within conm";
run;
*There is one company value per each conm ;

*** B4. Number of unique companies by industry groups (1-digit NAICS);
PROC SQL;
  CREATE TABLE compustat AS
  SELECT *, count(distinct company) AS ind_uniquecomp
  FROM compustat
  GROUP BY naics1;
quit;
PROC MEANS mean missing nonobs maxdec=0;
  VAR ind_uniquecomp;
  CLASS naics1 ;
  title "Number of unique companies for each 1-digit NAICS industry";
run;
/* 1. AGR 450 unique companies, etc. */
*** B5. Frequency of each company.
- The sample period is 2001 - 2020, but not every company is alive in all 20 years. Count the
frequency of each company. Then tabulate how many companies appear once, twice, …, 20 times. ;
PROC SQL;
  CREATE TABLE compustat AS
  SELECT *, count(year) AS compfreq
  FROM compustat
  GROUP BY company;
quit;
PROC FREQ;
  TABLE compfreq;
  title "Frequency of each company name (company) - Number of years it appears";
run;


*** B6. Company size statistics by industry groups
- For each 1-digit NAICS industry, calculate the mean, standard deviation,
maximum, and minimum company size based on employment.
- Tabulate this in 4x9 table (5 x 10 including the headings) showing each
industry in each row. ;
PROC SQL;
  CREATE TABLE compustat AS
  SELECT *,
  mean(emp) as sizemean_n1,
  std(emp) as sizesd_n1,
  max(emp) as sizemax_n1,
  min(emp) as sizemin_n1
  FROM compustat
  GROUP BY naics1;
quit;


PROC MEANS data=compustat mean nonobs;
  VAR sizemean_n1 sizesd_n1 sizemax_n1 sizemin_n1;
  FORMAT sizemean_n1 sizesd_n1 8.2 sizemin_n1 sizemax_n1 8.0;
  CLASS naics1;
  title "Company size stats by industry groups";
run;

*** B7. Company size statistics by industry groups and region
- Calculate the mean, maximum, and minimum employment as above for each naics1 and region pair
- Tabulate them in 12 x 9 table showing industry groups in rows and the regional stats in columns.;
* Easy way;
PROC MEANS data=compustat nonobs mean std min max maxdec=4;
VAR emp;
CLASS region naics1;
output out=quest7;
title "Company size stats by industry groups and regions, using PROC MEANS";
run;
/* This needs a bit of editing, but minor. */
data quest7;
  set quest7 (drop=_type_ _freq_);
  if region eq . or naics1 eq . then delete; 
if _stat_ in ('MIN','MAX','MEAN'); 
if _stat_ ='MEAN' the ord=1;
if _stat_='MAX' then ord=2;
if _stat_='MIN' then ord=3;
run;

proc sort data=quest7;
  by  naics1;
run;
proc transpose data=quest7 out=quest7_final (drop=_name_ _label_ );
  by naics1;
  var emp;
  id region _stat_;
run;


proc print data=quest7_final;
title'Company size statistics by industry groups and region';
run;
title;
*question 8;
DATA compustat;
  SET class.Sas_ho_compustat_datafile;
run;
** Make naics1 and assign the value label;
DATA compustat;
  SET compustat;
  naics1=INPUT(substr(naics,1,1), 1.);
  FORMAT naics1 naics1_lbl.;
  LABEL naics1 = "1-digit NAICS industry";
run;
*Employment growth statistics;
*Calculate the annual employemnet growth using the lagged variable;
PROC MEANS nonobs sum maxdec=4;
  VAR emp;
  CLASS year;
  title "annual employemnet growth using the lagged variable";
run;

PROC MEANS data=compustat nonobs sum mean maxdec=4;
  VAR emp;
  CLASS year naics1;
  title " industry-level average annual employment growth for the whole sample period and
for each year. ";
run;

proc sort data=compustat ;
  by year;
run;

proc means data=compustat nonobs mean maxdec=4 noprint;  
  var emp;
  class year naics1;
  where year in (2000,2002,2004,2006,2008,2010,2012,2014,2016,2018,2020);
  output out=stat1;
run;

data stat2;
  set stat1;
  if year eq . or naics1 eq . then delete; 
if _stat_='MEAN'; 
run;
proc sort data=stat2;
  by naics1;
run;
proc transpose data=stat2 out=stat3 (drop=_name_ _label_ );
  by naics1;
  var emp;
  id year;
run;

proc print data=stat3;
title'Even number years and the sample period';
run;
title;

** THE END of the handout **
* Save the output as a PDF file;
ODS PDF close;