/************************************************************************************************************; This program is to assist the NAACCR community in reading and writing NAACCR XML files using SAS. This program is to be used in conjunction with the Word document, Instructions for ReadWrite_NAACCR_21_XML_tidy.sas_20210219.docx. It harnesses SAS code and macros written by Fabian Depry, IMS, adds SAS labels, and removes fields from the SAS datasets that are 100% missing. ************************************************************************************************************/; *** STEP 1. Specify the folder location that SAS will use to write HTML output files. ***; ods html path="your-path" gpath="your-path"(URL=none); ods html newfile=proc; *** STEP 2. Use Fabian's tools to Read an XML file into SAS. *** Specify the locaton of the read_naaccr_xml_macro.sas, the path for the SAS jar file, the source XML data file ***; *** (this can read GZIP or unzipped XML files), and the CSV version of the user-defined data dictionary (if there is one). ***; /************************************************************************************************************; Original Comments from Fabian's "read" example: This programs demonstrates how to include and call the "read_naaccr_xml_macro.sas" macro. This example assumes that the JAR file is in the same folder as this program. Make sure that the naaccrVersion and recordType are correct or some data items won't be correctly populated. Commented out in the code below, this example includes two items, meaning that only those two items will be included in the resulting date set. That parameter is optional and if not provided, the data set will contain all standard items plus any non-standard items provided via the extra dictionary. Be aware that creating a data set containing all items will be MUCH slower than creating one for just a few items, and so if you only need a handful of items to do your analysis, it is strongly recommended to provide those items (you can check the official NAACCR documentation (http://datadictionary.naaccr.org/default.aspx?Version=21) to find the NAACCR XML IDs to use in that list). This example references an extra user-defined dictionary that defines non-standard NAACCR data items. If your data file only contains standard data items, that dictionary is not needed. Otherwise the dictionary should have been provided by the organization that created the XML data file. Dictionaries are usually in XML format, but for technical reasons, the macro expects them in CSV files; the NAACCR XML Tool that is distributed with the macros has an option to load a dictionary and save it as CSV. The Word document describes how to do this in detail. ************************************************************************************************************/; %include "your-path\naaccr-xml-utility-7.8\sas\read_naaccr_xml_macro.sas"; %readNaaccrXml( libpath="your-path\naaccr-xml-utility-7.8\sas\naaccr-xml-7.8-sas.jar", sourcefile="your-path\your-naaccr-xml-data-file.xml.gz", naaccrversion="210", recordtype="I", dataset=fromxml, /* items="patientIdNumber,primarySite", */ dictfile="your-path\your-user-defined-naaccr-xml-dictionary.csv" ); proc contents data=fromxml position; run; *** STEP 3. Create SAS variable labels. ***; *** a. Open the NAACCR XML Utility.exe, go to the Standard Dictionaries, and choose the NAACCR 21 base dictionary. Extract to CSV. ***; *** b. In the 'filename dicts...' below, put the path and name of the standard dictionary CSV that you extracted on the first line. ***; *** c. In the 'filename dicts...' below, put the path and name of the user-defined dictionary CSV that you extracted in Step 4 of the ***; *** Word document instructions on the second line. If you do not have a user-defined dictionary, leave this blank. ***; *** d. In the 'filename labels...' below, put the path where you want to store a text file of SAS labels statements. This will be ***; *** used in Step 4 to label the SAS variables. ***; filename dicts ("your-path\naaccr-xml-utility-7.8\base-naaccr-dictionary-210.csv" "your-path\your-user-defined-naaccr-xml-dictionary"); filename labels "your-path\label-statements.txt"; data dict2labels; infile dicts delimiter = ',' MISSOVER DSD lrecl=32767 firstobs=2; format NAACCR_XML_ID $32. NAACCR_Number best12. Name $100. Start_Column best12. Length best12. Record_Types $20. Parent_XML_Element $20. Data_Type $20. labelstatement $200.; input NAACCR_XML_ID $ NAACCR_Number Name $ Start_Column $ Length Record_Types $ Parent_XML_Element $ Data_Type $; labelstatement = "label "||strip(NAACCR_XML_ID)||" = '"||strip(NAACCR_Number)||"_"||strip(Name)||" - "||strip(NAACCR_XML_ID)||" - "||strip(Parent_XML_Element)||"';"; if NAACCR_XML_ID='NAACCR XML ID' then delete; file labels; put labelstatement; run; proc print data=dict2labels noobs n; var labelstatement; run; *** STEP 4. This is the TIDY part of the code that labels the SAS variables and drops variables that are 100% missing. If you looked ***; *** at the proc contents results from Step 2, you may have noticed that every single variable in the NAACCR V21 data dictionary is included, ***; *** even though some of the variables were not included in the data file. ***; *** The SAS Log will show notes for the variables not in the data file, but that you tried to label, indicating that the variable is ***; *** uninitialized, for example "NOTE: Variable textStaging is uninitialized." Don't worry about these notes. :) ***; data xml_labeled; set fromxml; %include labels; run; * Get list of variables ; proc transpose data=xml_labeled (obs=0) out=names ; var _all_; run; * Generate code to count non-missing values ; filename code temp; data _null_; file code ; set names end=eof; if _n_=1 then put 'proc sql noprint;' /'create table _counts as select' /' ' @ ; else put ',' @ ; put 'sum(not missing(' _name_ ')) as ' _name_ ; if eof then put 'from xml_labeled' /';' /'quit;' ; run; * Run generated code ; %include code / source2 ; * Generate DROP statement ; filename code temp; data _null_; set _counts ; array c _numeric_; file code lrecl=80 ; length _name_ $32 ; put 'drop ' @; do _n_=1 to dim(c); if c(_n_)=0 then do ; _name_ = vname(c(_n_)); put _name_ @ ; end; end; put ';' ; run; * Make version of data without empty variables ; data xml_labeled_tidy ; set xml_labeled; %include code / source2; run; proc contents data=xml_labeled_tidy position; run; *** STEP 5. You can now work with the SAS dataset "xml_labeled_tidy." You may wish to save it as a permanent SAS dataset (in a folder) ***; *** or modify the code below to review the frequencies of variables. You can modify data items, like remove the day components of dates. ***; *** You can do statistical analysis, etc. ***; *** The example below is for the Feb 2021 SEER Data Submission, which includes a subset of variables from the standard NAACCR dictionary ***; *** and additional user-defined variables. ***; title1 'FREQUENCIES FOR NAACCR STANDARD DATA ITEMS AND NON-STANDARD DATA ITEMS'; proc freq data=xml_labeled_tidy; format patientIdNumber $2.; tables _ALL_; run; title1 'FREQUENCIES FOR NON-STANDARD DATA ITEMS'; proc freq data=xml_labeled_tidy; tables seerSeerSiteRecode oncotypeDxPrstScore oncotypeDxPrstRiskGroup oncotypeDxPrstRiskGroup1317 oncotypeDxPrstReasonNoScore oncotypeDxPrstReportDate oncotypeDxRecurrenceScore oncotypeDxRsRiskGroup oncotypeDxRsReasonNoScore oncotypeDxRsTestReportDate oncotypeDxDcisScore oncotypeDxDcisRiskGroup oncotypeDxDcisReasonNoScore oncotypeDxDcisTestReportDate; *caTumorId; run; *** STEP 6. Use Fabian's tools to Write an XML file from SAS. ***; *** Specify the locaton of the write_naaccr_xml_macro.sas, the path for the SAS jar file, the target XML file, ***; *** the dataset, and the CSV version of the user-defined data dictionary (the same that you used above). ***; /************************************************************************************************************; Original Comments from Fabian's "write" example: This programs demonstrates how to include and call the "write_naaccr_xml_macro.sas" macro. While it's possible to use the write macro without the read one, they are really meant to be used together. This example assumes that the JAR file is in the same folder as this program. Make sure that the naaccrVersion and recordType are correct or some data items won't be correctly populated. This example references an extra user-defined dictionary that defines non-standard NAACCR data items. If your data file only contains standard data items, that dictionary is not needed. Otherwise the dictionary should have been provided by the organization that created the XML data file. Dictionaries are usually in XML files, but for technical reasons, the macro expects them in CSV format; the NAACCR XML Tool that is distributed with the macros has an option to load a dictionary and save it as CSV. For writing proper XML files, the macro also needs the dictionary URI; since the CSV format doesn't contain that URI, it needs to be provided as a parameter. The URI can be found as a root attribute of the XML dictionary (it usually looks like an internet address, but it's rarely a legit address). ************************************************************************************************************/; %include "your-path\naaccr-xml-utility-7.8\sas\write_naaccr_xml_macro.sas"; %writeNaaccrXml( libpath="your-path\naaccr-xml-utility-7.8\sas\naaccr-xml-7.8-sas.jar", targetfile="your-path\your-output-naaccr-xml-data-file.xml.gz", naaccrversion="210", recordtype="I", dataset=fromxml, /* items="patientIdNumber,primarySite", */ dictfile="your-path\your-user-defined-naaccr-xml-dictionary.csv", dicturi="https://your-user-defined-naaccr-xml-dictionary-url.xml" ); *** STEP 7. FOR TESTING ONLY. *** FOR COMPARISON OF READ AND WRITE STATEMENTS ***; *** a. Read in the XML you just wrote. ***; *%include "your-path\naaccr-xml-utility-7.8\sas\read_naaccr_xml_macro.sas"; *%readNaaccrXml( libpath="your-path\naaccr-xml-utility-7.8\sas\naaccr-xml-7.8-sas.jar", sourcefile="your-path\your-output-naaccr-xml-data-file.xml.gz", naaccrversion="210", recordtype="I", dataset=fromxml2, /* items="patientIdNumber,primarySite", */ dictfile="your-path\your-user-defined-naaccr-xml-dictionary.csv" ); *title1 "CONTENTS OF STANDARD AND NON-STANDARD DATA ITEMS IN 'sourcefile' above"; *proc contents data=fromxml2 position; *run; *** b. Compare the original XML file as read by SAS with the one that was written. ***; *proc sort data=fromxml; * by addrAtDxState patientIdNumber sequenceNumberCentral; *run; *proc sort data=fromxml2; * by addrAtDxState patientIdNumber sequenceNumberCentral; *run; *title1 "RESULTS FROM COMPARISON OF ORIGINAL XML FILE and XML FILE THAT WAS WRITTEN BY SAS"; *proc compare base=fromxml compare=fromxml2 maxprint=(20,10000) LISTBASEOBS LISTCOMPOBS; * id addrAtDxState patientIdNumber sequenceNumberCentral; *run;