####################################################################
# divide_data.py
#   used to do first break-apart of lamarc data into pieces for
#   "poor man's parallelization"
####################################################################
# system imports
import os.path
import sys
from xml.dom.minidom import parse, Document

# local (project) imports
import parallelCommon


# input options: the lamarc input file to split (lamarcfile), the
# directory that receives the generated files (lamdir), and pydir, which
# is only handed on to parallelCommon.finalStep at the end of the script
# NOTE(review): meaning of the True argument is defined in parallelCommon
[lamarcfile,lamdir,pydir] = parallelCommon.getOptionsAndVerify(True)

parallelCommon.describeThisScript("divide_data.py","divide regions and replicates",lamarcfile,lamdir)

# parse the lamarc input file into a DOM. try/finally makes sure the
# file handle is released even if the XML is malformed and parse() raises
inlamarc = open(lamarcfile,'r')
try:
    lamDom = parse(inlamarc)
finally:
    inlamarc.close()

# find replicate count: walk lamarc -> chains -> replicates.
# when no replicates tag is found the input counts as a single replicate
# (presumably getFirstTag hands back a false value for a missing tag --
# verify against parallelCommon)
lamarcTag = parallelCommon.getFirstTag(lamDom,"lamarc")
chainsTag = parallelCommon.getFirstTag(lamarcTag,"chains")
replicatesTag = parallelCommon.getFirstTag(chainsTag,"replicates")
if not replicatesTag:
    originalReplicateCount = 1
else:
    originalReplicateCount = parallelCommon.getLongVal(replicatesTag)

# find region count: every <region> element found below the <data> tag
dataElem = parallelCommon.getSingleTag(lamarcTag,"data")
regionElems = dataElem.getElementsByTagName("region")
regionCount = len(regionElems)

# bail if regions and replicates are both singletons: there is nothing
# to split, so report the problem and stop with exit status 2
nothingToDivide = (regionCount == 1) and (originalReplicateCount == 1)
if nothingToDivide:
    print "ERROR: lamarc input file %s has only one region" % lamarcfile
    print "       and only one replicate. It cannot be broken down any further"
    sys.exit(2)


# add comment to identify outfile as generated by this script.
# it becomes the first child of the lamarc tag, so it shows up in the
# copy written below and in every sub-file written later from this DOM
commentNode = lamDom.createComment("Created by divide_data.py")
lamarcTag.insertBefore(commentNode,lamarcTag.firstChild)

# write out full copy using writexml. we do this so we can compare
# the original input file content in the same formatting we're using
# to generate the sub-files
lamcopy = os.path.join(lamdir,"infile_copy.xml")
outf = open(lamcopy,'w')
try:
    lamDom.writexml(outf)
finally:
    # close the handle even if serialization fails part way through
    outf.close()

# don't do profiles until very last run
# (this happens after infile_copy.xml was written, so that copy is
# serialized before this call touches the DOM)
parallelCommon.turnProfilesOff(lamDom,lamarcTag)


# change replicate number to 1 if it exists, so each generated sub-file
# asks for a single replicate. the original count stays available in
# originalReplicateCount for the replicate loop further down
if replicatesTag:
    parallelCommon.setVal(replicatesTag,"1")

# find format tag. it is passed to parallelCommon.fixFormatTag once per
# generated sub-file
# NOTE(review): if the input has no format tag this may be a false value;
# confirm fixFormatTag copes with that
formatTag = parallelCommon.getFirstTag(lamarcTag,"format")

# disconnect regions from dom -- we'll be putting them back
# in later one at a time
# (regionElems is the list collected earlier, so removing nodes from
# the tree does not disturb this iteration)
for region in regionElems:
    dataElem.removeChild(region)

# will store names of single region/single replicate lamarc files
infileList = []

# output each region file: the shared DOM is reused as a template that
# holds exactly one region at a time
for regCount, region in enumerate(regionElems):
    dataElem.appendChild(region)
    for repCount in range(originalReplicateCount):
        # the replicate suffix is only used when the original run
        # actually asked for more than one replicate
        if originalReplicateCount == 1:
            idString = 'reg%d' % regCount
        else:
            idString = 'reg%d_rep%d' % (regCount,repCount)

        parallelCommon.fixFormatTag(lamDom,formatTag,idString,False)

        # each sub-run gets its own directory <lamdir>/<idString>
        runDir = os.path.join(lamdir,idString)
        if not os.path.exists(runDir):
            os.makedirs(runDir)
        regFile = os.path.join(runDir,'infile_%s.xml' % idString)
        outf = open(regFile,'w')
        try:
            lamDom.writexml(outf)
        finally:
            # release the handle even if serialization fails, so a
            # failure does not leave a file descriptor open
            outf.close()
        infileList.append(regFile)

    # remove the region tag so next file output doesn't include it
    dataElem.removeChild(region)

    # EWFIX -- unlink region to save memory ??
    

# output instructions: files to run, next program to run
parallelCommon.nextStep(lamdir,infileList,False)

# the follow-up script depends on whether the original input asked for
# more than one replicate
if originalReplicateCount != 1:
    nextScript = "combine_replicates.py"
else:
    nextScript = "combine_regions.py"
parallelCommon.finalStep(pydir,nextScript,lamarcfile,lamdir)
