SAS Migration to Python or other languages

Semantic Designs can provide your organization with highly accurate automated conversion of legacy SAS applications (with embedded SQL) to modern technologies based on Python (or other languages such as Java, C#, or Julia (growing popularity).

Migrating SAS to new languages provides some nice properties:

  • Makes the application more maintainable, and easier to enhance due to the enormous libraries available.
  • Performance may improved just by translating. Python is notable for efficient math libraries such as pandas; translation to Java or C# enables translated code to be compiled. Julia is used in supercomputing applications because of its performance.
  • Moves it to modern technology enhancing developer productivity often enabled by better tools and IDEs.
  • Provides the organization with access to more easily found (Python, Java, etc.) software engineers. SAS talent is rapidly becoming rare, and new developers see no point in learning it.
all which contribute to faster turnaround on desired changes for management.

A key concern is, What will translated code look like? Weak translators convert code blindly line-by-line, with virtually no understanding of how the context of the code should shape its translation. Such translated code preserves the properties of the original SAS, sometime the point of even preserving the syntax. Even if such code works, it not be maintainable and will ultimately cost the organization sorely.

Semantic Designs translators parse the original code with tools that have the same understanding as the original compilers; (for example, see our SAS Front End). SAS doesn't really provide declarations of variable types, so SD's tools need to infer appropriate types (e.g., arrays of numbers, strings, record structures) based on how variables are used in the program, where and how they are used ("the context"). Using this information the translator can thus produce extremely good translations. Because SD also customizes the tools to the specific needs of the customer, the code reflects those needs properly.

A SAS program translation

Here we show you a small sample of SAS translated to Python by SD's translation technology. Immediately below you can see the sample SAS program. Click here to see the the Python code generated.

A simple SAS program

We have chosen to provide a small (166 line) SAS program to prevent the reader from being overwhelmed by the complexities that occur in real examples.

*options nosource nonotes;
*options nosymbolgen nomprint nomlogic;

options source notes;
options symbolgen mprint mlogic;
%let MAX_PQ = 10;
%put  all;

%let ldir      = c:/sas2python/data;
%let ldata     = sample_demo;
libname ldir   "&ldir";
%let odir      = c:/sas2python/results/python;
libname rout   "&odir";
%let cfiles    = X1;
%let ctypes    = DEMO;
%let cids      = Y1;

%let lrate     = 0.0023;
%let rrate     = 0.2300;
%let RUNDATE   = 08/01/2020;
%let FEXEC     = X1;
%let FTYPE     = ABC;
%let FITER     = QTR;

%let COMPTYPE  = MR;
%let MODID     = R01;
%let SCEN      = B01;
%let MVERSION  = 1.0;
%let COMPDIR   = c:/sas2python/results/python;

data work_set;
    set ldir.&ldata;
    keep past_due_days 
        AMT GRP_ID GRP_NAME END_DATE;
run;

data work_set_fc;
    length RUNDATE FEXEC FTYPE FITER RUN_ID TASKID COMPID COMPTYPE
        MODID SCEN MVERSION $50 GRP_ID 8 GRP_NAME $100 
        END_DATE 8 SCODE ;
    set work_set;
    RUNDATE = upcase("&RUNDATE");
    FEXEC = upcase("&FEXEC");
    FTYPE = upcase("&FTYPE");
    FITER = upcase("&FITER");

    COMPTYPE = upcase ("&ctypes");

    array pq_array(&MAX_PQ) PQ1-PQ&MAX_PQ;
    array balance(&MAX_PQ) balance1-balance&MAX_PQ;
    
    do i = 1 to &MAX_PQ;
        if i = 1 then
            balance(i) = T0;
        else  balance(i) = pq_array(i - 1);

        pq_array(i) = balance(i) * .0001;
    end;

    output;

    MEASURE = 'NET_LOSS';
    T0= .;
    do i = 1 to &MAX_PQ;
        if balance(i) ne . then 
            if past_due_days >= 120 then
            pq_array(i) = balance(i) * (1 - &rrate);
        else  
            pq_array(i) = balance(i) * &lrate;
    end;

    output;
    
    keep RUNDATE FEXEC FTYPE FITER COMPTYPE 
        MODID SCEN MVERSION GRP_ID GRP_NAME END_DATE ;

run;

/*if VARNAME not defined, assign it with VARVAL */
%macro set_macro_var(VARNAME, VARVAL);
    %if not %symexist (&VARNAME) %then
        %do;
            %global &VARNAME;
            %let &VARNAME = &VARVAL;
        %end;
%mend set_macro_var;

data work_set_fc0;
    set work_set_fc;
    array pq_array (&MAX_PQ) PQ1-PQ&MAX_PQ;

    do i = 1 to &MAX_PQ;
        if pq_array(i) = . then
            pq_array(i) = 0;
        pq_array(i) = round (pq_array(i), 0.01);
    end;

    drop i;
run;

data all_bal;
    set work_set_fc0;
    where measure in ('MODIFIED');
    keep T0;
run;

proc sql noprint;
    select sum(t0) format = best32. 
        into :TOTAL_BAL 
        from all_bal;
quit;

%put Total balance = [%sysfunc(strip(&TOTAL_BAL))];

data all_loss;
    set work_set_fc0;
    where measure in ('NET_LOSS');
    total_loss = sum(of PQ:);

    keep total_loss;
run;

proc sql noprint;
    select sum(total_loss) format = best32., sum(total_loss2) format = best32. 
        into :TOTAL_LOSS, :TOTAL_LOSS2 
            from all_loss;
quit;

data summary;
    length SCEN $50 category notes $50 value 8;

    SCEN  = 'Beginning';
    notes = '';
    category = 'Total Balance';
    value = &TOTAL_BAL;
    output;
    category = 'Loss';
    value = &TOTAL_LOSS;
run;

PROC SORT DATA = work_set_fc0;
     BY UID;
RUN;

libname lib_comp "&COMPDIR";
%let OUT_FILE = &COMPDIR/model.txt;

proc export data = work_set_fc0
    outfile = "&OUT_FILE" 
    dbms = dlm replace;
    delimiter = '|';
run;

data lib_comp.pure_model;
    set work_set_fc0;
run;

%set_macro_var(qin, 12);
%set_macro_var(qout, 20);
%set_macro_var(NUM_SCEN, 2);


proc export data = summary 
    outfile =  "&COMPDIR/summary.csv" 
    dbms = csv replace;
run;

Each client has different goals and these can change the chosen target technology.

  • Target language of Python (widely accepted), Java or C#, or Julia.
  • Mapping of SAS concepts, especially those unique to SAS PROCs, to specific efficient code fragments or target languages libraries (e.g., SQL PROCs to NHibernate for C#, or a Python ORM for a Python target.
  • Mapping of database queries to the client's database of choice; this may be complicated by the fact that SQL only appears to be standard across databases.

One should note that the translator should chose fixed data types for variables compatible with their usage in the SAS program or the database schema, but with a different or detailed representation better suited for for the target language.

Which combination of choices is made can determine very different style of translated code. The translator needs to be configured to account for the client's target technology choices.

SAS program automatically translated to Python

This output is reproduced exactly as SD's translator generated it for Python. The translator produces nicely formatted target language code as a standard by-product.

Click here to see original SAS program

# Translated by Semantic Designs SAS~V9R4 to Python~v3_1 Translator V1.0.20200221
import math
import pandas
import warnings
warnings.filterwarnings("ignore")
def _symexist(s):
    return "_"+s.upper() in globals()
def _symget(s):
    try:
        return globals()["_"+s.upper()]
    except KeyError:
        return None
def _symput(s, v):
    globals()["_"+s.upper()] = v
def _symputx(s, v):
    globals()["_"+s.upper()] = v.strip() if isinstance(v, str) else v
_MAX_PQ = 10
_LDIR = "c:/sas2python/data"
_LDATA = "sample_demo"
ldir = _LDIR
_ODIR = "c:/sas2python/results/python"
rout = _ODIR
_CFILES = "X1"
_CTYPES = "DEMO"
_CIDS = "Y1"
_LRATE = 0.0023
_RRATE = 0.2300
_RUNDATE = "08/01/2020"
_FEXEC = "X1"
_FTYPE = "ABC"
_FITER = "QTR"
_COMPTYPE = "MR"
_MODID = "R01"
_SCEN = "B01"
_MVERSION = 1.0
_COMPDIR = "c:/sas2python/results/python"
# <data step: 31>
currentDataFrame = pandas.read_sas(ldir+"/"+_LDATA+".sas7bdat", format="sas7bdat", encoding="ISO-8859-1")
currentDataFrame = currentDataFrame[["PAST_DUE_DAYS", "AMT", "GRP_ID", "GRP_NAME", "END_DATE"]]
work_set = currentDataFrame
del currentDataFrame
# <end>
# <data step: 37>
output = []
currentDataFrame = work_set.copy()
currentDataFrame["RUNDATE"] = _RUNDATE.upper()
currentDataFrame["FEXEC"] = _FEXEC.upper()
currentDataFrame["FTYPE"] = _FTYPE.upper()
currentDataFrame["FITER"] = _FITER.upper()
currentDataFrame["COMPTYPE"] = _CTYPES.upper()
for i in range(1, _MAX_PQ+1):
    if "PQ"+str(i) not in currentDataFrame.columns:
        currentDataFrame["PQ"+str(i)] = None
for i in range(1, _MAX_PQ+1):
    if "BALANCE"+str(i) not in currentDataFrame.columns:
        currentDataFrame["BALANCE"+str(i)] = None
for currentRow in currentDataFrame.index:
    for i in range(1, _MAX_PQ+1):
        if i == 1:
            currentDataFrame["BALANCE"+str(i)][currentRow] = currentDataFrame["T0"][currentRow]
        else:
            currentDataFrame["BALANCE"+str(i)][currentRow] = currentDataFrame["PQ"+str(i-1)][currentRow]
        currentDataFrame["PQ"+str(i)][currentRow] = currentDataFrame["BALANCE"+str(i)][currentRow]*.0001
output += [currentDataFrame.copy()]
currentDataFrame["MEASURE"] = "NET_LOSS"
currentDataFrame["T0"] = None
for currentRow in currentDataFrame.index:
    for i in range(1, _MAX_PQ+1):
        if currentDataFrame["BALANCE"+str(i)][currentRow] is not None:
            if currentDataFrame["PAST_DUE_DAYS"][currentRow] >= 120:
                currentDataFrame["PQ"+str(i)][currentRow] = currentDataFrame["BALANCE"+str(i)][currentRow]*(1-_RRATE)
            else:
                currentDataFrame["PQ"+str(i)][currentRow] = currentDataFrame["BALANCE"+str(i)][currentRow]*_LRATE
output += [currentDataFrame.copy()]
currentDataFrame = pandas.concat(output, ignore_index=True, sort=False)
currentDataFrame = currentDataFrame[currentDataFrame.columns.intersection(["RUNDATE", "FEXEC", "FTYPE", "FITER", "COMPTYPE", "MODID", "SCEN", "MVERSION", "GRP_ID", "GRP_NAME", "END_DATE"])]
work_set_fc = currentDataFrame
del output, currentDataFrame
# <end>
def SET_MACRO_VAR(_VARNAME, _VARVAL):
    if not _symexist(_VARNAME):
        _symput(_VARNAME, _VARVAL)
# <data step: 88>
currentDataFrame = work_set_fc.copy()
for i in range(1, _MAX_PQ+1):
    if "PQ"+str(i) not in currentDataFrame.columns:
        currentDataFrame["PQ"+str(i)] = None
for currentRow in currentDataFrame.index:
    for i in range(1, _MAX_PQ+1):
        if currentDataFrame["PQ"+str(i)][currentRow] is None:
            currentDataFrame["PQ"+str(i)][currentRow] = 0
        currentDataFrame["PQ"+str(i)][currentRow] = round(currentDataFrame["PQ"+str(i)][currentRow], 2)
currentDataFrame = currentDataFrame.drop(currentDataFrame.columns.intersection(["I"]), axis=1)
work_set_fc0 = currentDataFrame
del currentDataFrame
# <end>
# <data step: 101>
currentDataFrame = work_set_fc0.copy()
currentDataFrame = currentDataFrame[currentDataFrame["MEASURE"].isin(["MODIFIED"])]
currentDataFrame = currentDataFrame[["T0"]]
all_bal = currentDataFrame
del currentDataFrame
# <end>
# <proc step (sql): 107>
currentTable = all_bal
_TOTAL_BAL = currentTable["T0"].sum()
del currentTable
# <end>
# <data step: 115>
currentDataFrame = work_set_fc0.copy()
currentDataFrame = currentDataFrame[currentDataFrame["MEASURE"].isin(["NET_LOSS"])]
currentDataFrame["TOTAL_LOSS"] = currentDataFrame[[c for c in currentDataFrame if c.startswith("PQ")]].sum(axis=1)
currentDataFrame = currentDataFrame[["TOTAL_LOSS"]]
all_loss = currentDataFrame
del currentDataFrame
# <end>
# <proc step (sql): 123>
currentTable = all_loss
_TOTAL_LOSS, _TOTAL_LOSS2 = currentTable["TOTAL_LOSS"].sum(), currentTable["TOTAL_LOSS2"].sum()
del currentTable
# <end>
# <data step: 129>
output = []
currentDataFrame = pandas.DataFrame(index=[0])
currentDataFrame["SCEN"] = "Beginning"
currentDataFrame["NOTES"] = ""
currentDataFrame["CATEGORY"] = "Total Balance"
currentDataFrame["VALUE"] = _TOTAL_BAL
output += [currentDataFrame.copy()]
currentDataFrame["CATEGORY"] = "Loss"
currentDataFrame["VALUE"] = _TOTAL_LOSS
currentDataFrame = pandas.concat(output, ignore_index=True, sort=False)
summary = currentDataFrame
del output, currentDataFrame
# <end>
# <proc step (sort): 141>
work_set_fc0.sort_values(by="UID", ascending=True, inplace=True, ignore_index=True)
# <end>
lib_comp = _COMPDIR
_OUT_FILE = _COMPDIR+"/model.txt"
# <proc step (export): 148>
work_set_fc0.to_csv(_OUT_FILE, sep="|", index=False, header=True, encoding="utf-8")
# <end>
# <data step: 154>
work_set_fc0.to_pickle(lib_comp+"/"+"pure_model.pkl.xz")
# <end>
SET_MACRO_VAR("qin", 12)
SET_MACRO_VAR("qout", 20)
SET_MACRO_VAR("NUM_SCEN", 2)
# <proc step (export): 163>
summary.to_csv(_COMPDIR+"/summary.csv", index=False, header=True, encoding="utf-8")
# <end>

Click here to see original SAS program

One can get good code like the above, from a translator, if one has the right foundation technology such as SD's DMS Software Reengineering Tookit®. As a practical matter, you can't get good code from an off the shelf translator, because every application system has unique properties: languages, OS features, scripting languages, screens, databases, and a correspondingly unique set of target technologies as decided by the client. It takes some effort to configure DMS for the client's particular source and target software configurations. The effort is rewarded by the maintainablity of the result.

Configuring a custom migration tool to match client needs

Semantic Designs can provide custom configuration of migration tools and migration support to your organization, providing high quality, maintainable code translations.

For more information: [email protected]    Follow us at Twitter: @SemanticDesigns

SAS
Migration