*Summary statistics; *Created 05DEC2022; * Save the output as PDF; ODS PDF file="Summarystats_output.pdf"; *Create the library and read in the datasets; * Assign the working directory to a library; LIBNAME class "C:\Users\jpark8\OneDrive - Kent State University\Documents\Econ62052\CLASSES\1202\"; PROC FORMAT CNTLIN=class.Hw8_ipumstimeuse_formats; RUN; options fmtsearch=(class ); * Make a duplicate file with a shorter name (still in CLASS library though) and LOOK; DATA compustat; SET class.Sas_ho_compustat_datafile; run; PROC CONTENTS order=varnum; title "Checking the compustat file first"; run; *=================================================================== * A. create categorical variables; *===================================================================; *** 1. 1-digit NAICS code: NAICS is 6-digit character----------------; ** a. Create a value label (format); PROC FORMAT; VALUE naics1_lbl 1 = "1. Agriculture, Forestry, Fishing and Hunting" 2 = "2. Mining, Extraction, Utilities" 3 = "3. Manufacturing" 4 = "4. Trade, Warehousing, Transportation" 5 = "5. Professional Services" 6 = "6. Education, Healthcare, and Social Services" 7 = "7. Recreation/Leisure Services" 8 = "8. Other Services" 9 = "9. Public Administration"; run; ** Make naics1 and assign the value label; DATA compustat; SET compustat; naics1=INPUT(substr(naics,1,1), 1.); FORMAT naics1 naics1_lbl.; LABEL naics1 = "1-digit NAICS industry"; PROC FREQ; TABLE naics1; title "1-digit NAICS industry group"; run; * b. Display the value label naics1_lbl; PROC FORMAT FMTLIB LIB=Work.Formats; /* Here, you must indicate the "Work: library */ SELECT Naics1_lbl; title "naics1_lbl format"; run; * c. Tabulate the share of each naics1 in the entire sample.; PROC FREQ; TABLE naics1; title "share of each naics1 in the entire sample"; run; * d. Tabulate the share of each naics1 in year 2015.; PROC FREQ; TABLE naics1; where year=2015; title "share of each naics1 in year 2015"; run; *** 2. REGION----------------------------------------------------------; ** first make the format; PROC FORMAT; VALUE region_lbl 1 = "Northeast" 2 = "Midwest" 3 = "South" 4 = "West"; run; ** Make the region variable and assign the format directly; DATA compustat; SET compustat; IF state in ("CT","MA","ME","NH","NJ","NY","PA","RI","VT") THEN region=1; IF state in ("IA","IL","IN","KS","MI","MN","MO","ND","NE","OH","SD","WI") THEN region=2; IF state in ("AL","AR","DC","DE","FL","GA","KY","LA","MD","MS","NC","OK","SC","TN","TX","VA","WV") THEN region=3; IF state in ("AK","AZ","CA","CO","HI","ID","MT","NM","NV","OR","UT","WA","WY") THEN region=4; FORMAT region region_lbl.; PROC FREQ; TABLE region; title "Geographic regions"; run; title; *B Produce the following summary statistics; *1. What years are in the sample period; title 'Sample period'; proc freq; table year; run; title; title 'How many unique companies in the sample'; proc sql; select count(distinct(company)) as uniquecomp from compustat; quit; *20,010 unique names; title; PROC SQL; CREATE TABLE compustat AS SELECT *, count(distinct company) AS uniquecomp_conm FROM compustat GROUP BY conm; quit; PROC FREQ; TABLE uniquecomp_conm; title "Number of unique cleaned company names (company) within conm"; run; *There is one company value per each conm ; *** B4. Number of unique companies by industry groups (1-digit NAICS); PROC SQL; CREATE TABLE compustat AS SELECT *, count(distinct company) AS ind_uniquecomp FROM compustat GROUP BY naics1; quit; PROC MEANS mean missing nonobs maxdec=0; VAR ind_uniquecomp; CLASS naics1 ; title "Number of unique companies for each 1-digit NAICS industry"; run; /* 1. AGR 450 unique companies, etc. */ *** B5. Frequency of each company. - The sample period is 2001 - 2020, but not every company is alive in all 20 years. Count the frequency of each company. Then tabulate how many companies appear once, twice, …, 20 times. ; PROC SQL; CREATE TABLE compustat AS SELECT *, count(year) AS compfreq FROM compustat GROUP BY company; quit; PROC FREQ; TABLE compfreq; title "Frequency of each company name (company) - Number of years it appears"; run; *** B6. Company size statistics by industry groups - For each 1-digit NAICS industry, calculate the mean, standard deviation, maximum, and minimum company size based on employment. - Tabulate this in 4x9 table (5 x 10 including the headings) showing each industry in each row. ; PROC SQL; CREATE TABLE compustat AS SELECT *, mean(emp) as sizemean_n1, std(emp) as sizesd_n1, max(emp) as sizemax_n1, min(emp) as sizemin_n1 FROM compustat GROUP BY naics1; quit; PROC MEANS data=compustat mean nonobs; VAR sizemean_n1 sizesd_n1 sizemax_n1 sizemin_n1; FORMAT sizemean_n1 sizesd_n1 8.2 sizemin_n1 sizemax_n1 8.0; CLASS naics1; title "Company size stats by industry groups"; run; *** B7. Company size statistics by industry groups and region - Calculate the mean, maximum, and minimum employment as above for each naics1 and region pair - Tabulate them in 12 x 9 table showing industry groups in rows and the regional stats in columns.; * Easy way; PROC MEANS data=compustat nonobs mean std min max maxdec=4; VAR emp; CLASS region naics1; output out=quest7; title "Company size stats by industry groups and regions, using PROC MEANS"; run; /* This needs a bit of editing, but minor. */ data quest7; set quest7 (drop=_type_ _freq_); if region eq . or naics1 eq . then delete; if _stat_ in ('MIN','MAX','MEAN'); if _stat_ ='MEAN' the ord=1; if _stat_='MAX' then ord=2; if _stat_='MIN' then ord=3; run; proc sort data=quest7; by naics1; run; proc transpose data=quest7 out=quest7_final (drop=_name_ _label_ ); by naics1; var emp; id region _stat_; run; proc print data=quest7_final; title'Company size statistics by industry groups and region'; run; title; *question 8; DATA compustat; SET class.Sas_ho_compustat_datafile; run; ** Make naics1 and assign the value label; DATA compustat; SET compustat; naics1=INPUT(substr(naics,1,1), 1.); FORMAT naics1 naics1_lbl.; LABEL naics1 = "1-digit NAICS industry"; run; *Employment growth statistics; *Calculate the annual employemnet growth using the lagged variable; PROC MEANS nonobs sum maxdec=4; VAR emp; CLASS year; title "annual employemnet growth using the lagged variable"; run; PROC MEANS data=compustat nonobs sum mean maxdec=4; VAR emp; CLASS year naics1; title " industry-level average annual employment growth for the whole sample period and for each year. "; run; proc sort data=compustat ; by year; run; proc means data=compustat nonobs mean maxdec=4 noprint; var emp; class year naics1; where year in (2000,2002,2004,2006,2008,2010,2012,2014,2016,2018,2020); output out=stat1; run; data stat2; set stat1; if year eq . or naics1 eq . then delete; if _stat_='MEAN'; run; proc sort data=stat2; by naics1; run; proc transpose data=stat2 out=stat3 (drop=_name_ _label_ ); by naics1; var emp; id year; run; proc print data=stat3; title'Even number years and the sample period'; run; title; ** THE END of the handout ** * Save the output as a PDF file; ODS PDF close;