TuttleOFX  1
analyze.cpp
Go to the documentation of this file.
00001 #include "analyze.hpp"
00002 
00003 #include "detail/FileNumbers.hpp"
00004 #include "detail/FileStrings.hpp"
00005 #include "commonDefinitions.hpp"
00006 
00007 #include <boost/regex.hpp>
00008 #include <boost/unordered_map.hpp>
00009 #include <boost/lambda/lambda.hpp>
00010 #include <boost/foreach.hpp>
00011 #include <set>
00012 
00013 
00014 namespace sequenceParser {
00015 
00016 using detail::FileNumbers;
00017 using detail::FileStrings;
00018 namespace bfs = boost::filesystem;
00019 
00020 bool detectDirectoryInResearch( std::string& researchPath, std::vector<std::string>& filters, std::string& filename )
00021 {
00022         if( bfs::exists( researchPath ) )
00023         {
00024                 if( !bfs::is_directory( researchPath ) )
00025                 {
00026                         // the researchPath is an existing file, we search into the parent directory with filtering these filename
00027                         // warning: can find a sequence based on a filename
00028                         bfs::path tmpPath( researchPath );
00029                         filename = researchPath;
00030 
00031                         if( tmpPath.has_parent_path() )
00032                                 researchPath = tmpPath.parent_path().string();
00033                         else
00034                                 researchPath = ".";
00035                 }
00036                 // else
00037                 // the researchPath is a directory, we search into the directory without filtering
00038         }
00039         else
00040         {
00041                 bfs::path tmpPath( researchPath );
00042                 if( !tmpPath.has_parent_path() )
00043                 {
00044                         filters.push_back( researchPath );
00045                         researchPath = ".";
00046                         return true;
00047                 }
00048                 bfs::path parentPath( tmpPath.parent_path() );
00049                 if( !bfs::exists( parentPath ) )
00050                 {
00051                         // researchPath and it parent don't exists, could not find file/sequence/folder
00052                         return false;
00053                 }
00054                 // the researchPath is not a directory, but it parent is a directory
00055                 // we search in this parent directory, with the filtering pattern
00056                 filters.push_back( tmpPath.filename().string() );
00057                 researchPath = parentPath.string();
00058         }
00059         return true;
00060 }
00061 
00062 Sequence privateBuildSequence(
00063                 const Sequence& defaultSeq,
00064                 const FileStrings& stringParts,
00065                 const std::vector<FileNumbers>::const_iterator& numberPartsBegin,
00066                 const std::vector<FileNumbers>::const_iterator& numberPartsEnd,
00067                 const std::size_t index,
00068                 const std::size_t padding,
00069                 const bool strictPadding
00070         )
00071 {
00072         const std::size_t len = numberPartsBegin->size();
00073         Sequence sequence( defaultSeq );
00074 
00075         // fill information in the sequence...
00076         for( std::size_t i = 0; i < index; ++i )
00077         {
00078                 sequence._prefix += stringParts[i];
00079                 sequence._prefix += numberPartsBegin->getString( i );
00080         }
00081         sequence._prefix += stringParts[index];
00082         for( std::size_t i = index + 1; i < len; ++i )
00083         {
00084                 sequence._suffix += stringParts[i];
00085                 sequence._suffix += numberPartsBegin->getString( i );
00086         }
00087         sequence._suffix += stringParts[len];
00088 
00089         std::vector<FileNumbers>::const_iterator numberPartsLast = numberPartsEnd;
00090         --numberPartsLast;
00091 
00092         // standard case, one sequence detected
00093         sequence._firstTime = numberPartsBegin->getTime( index );
00094         sequence._lastTime = numberPartsLast->getTime( index );
00095         sequence._nbFiles = std::distance( numberPartsBegin, numberPartsEnd );
00096 
00097         sequence.extractStep( numberPartsBegin, numberPartsEnd, index );
00098         //sequence.extractPadding( numberPartsBegin, numberPartsEnd, index );
00099         sequence._padding = padding;
00100         sequence._strictPadding = strictPadding;
00101         //sequence.extractIsStrictPadding( numberPartsBegin, numberPartsEnd, index, sequence._padding );
00102 
00103         return sequence;
00104 }
00105 
00106 /**
00107  *
00108  * @param result
00109  * @param numberPartsBegin
00110  * @param numberPartsEnd
00111  * @param index
00112  */
00113 void privateBuildSequencesAccordingToPadding(
00114         std::vector<Sequence>& result,
00115         const Sequence& defaultSeq,
00116         const FileStrings& stringParts,
00117         const std::vector<FileNumbers>::iterator& numberPartsBegin,
00118         const std::vector<FileNumbers>::iterator numberPartsEnd,
00119         const int index )
00120 {
00121         std::set<std::size_t> paddings;
00122         std::set<std::size_t> ambiguousPaddindDigits;
00123         for( std::vector<FileNumbers>::const_iterator it = numberPartsBegin;
00124              it != numberPartsEnd;
00125                  ++it )
00126         {
00127                 const std::size_t padding  = it->getPadding(index);
00128                 const std::size_t nbDigits = it->getNbDigits(index);
00129 
00130                 paddings.insert( padding );
00131 
00132                 if( padding == 0 )
00133                 {
00134                         ambiguousPaddindDigits.insert( nbDigits );
00135                 }
00136         }
00137 
00138         if( paddings.size() == 1 )
00139         {
00140                 // standard case: only one padding used in the sequence!
00141                 const std::size_t p = *paddings.begin();
00142                 // simple sort
00143                 std::sort( numberPartsBegin, numberPartsEnd, FileNumbers::SortByNumber() );
00144                 result.push_back( privateBuildSequence( defaultSeq, stringParts, numberPartsBegin, numberPartsEnd, index, p, (p!=0) ) );
00145                 return;
00146         }
00147 
00148         bool onlyConsiderPadding = false;
00149         if( paddings.find( 0 ) == paddings.end() )
00150         {
00151                 // No element without padding.
00152                 // All parts are prefixed by 0, only strict padding,
00153                 // so we can sort by padding without ambiguity
00154                 onlyConsiderPadding = true;
00155         }
00156         else
00157         {
00158                 // We have a mix of padding and no padding.
00159                 // It may be the same number of digits (strict or ambiguous padding: [09, 10]).
00160                 // Some ambiguous cases:
00161                 //      --------------------------------------------------------------------------------
00162                 //      |     sort by padding      |     sort by padding     |     sort by digits      |
00163                 //      --------------------------------------------------------------------------------
00164                 //      |  number  padding digits  |  number padding digits  |  number padding digits  |
00165                 //      |  1       0       1       |  100    0       3       |  1      0       1       |
00166                 //      |  5       0       1       |  102    0       3       |  5      0       1       |
00167                 //      |  10      0       2       |                         |  10     0       2       |
00168                 //      |  100     0       3       |  001    3       3       |                         |
00169                 //      |  102     0       3       |  002    3       3       |  001    3       3       |
00170                 //      |  1000    0       4       |  099    3       3       |  002    3       3       |
00171                 //      |                          |                         |  099    3       3       |
00172                 //      |  001     3       3       |  0001   4       4       |  100    0       3       |
00173                 //      |  002     3       3       |  0123   4       4       |  102    0       3       |
00174                 //      |  099     3       3       |  1234   4       4       |                         |
00175                 //      |                          |                         |  0001   4       4       |
00176                 //      |  0001    4       4       |                         |  0123   4       4       |
00177                 //      |  0123    4       4       |                         |  1234   4       4       |
00178                 //      |  1234    4       4       |                         |                         |
00179                 //      |                          |                         |  10000  0       5       |
00180                 //      --------------------------------------------------------------------------------
00181                 //      |                          |  The sequence without   |   One sequence without  |
00182                 //      |   One sequence without   |  padding can be merge   | padding, should use a   |
00183                 //      |        padding           |   in sequence with      |     sort by padding     |
00184                 //      |                          |        padding 3        |                         |
00185                 //      --------------------------------------------------------------------------------
00186                 //      |          YES             |   NO : sort by digits   |  NO : sort by padding   |
00187                 //      --------------------------------------------------------------------------------
00188                 onlyConsiderPadding = false;
00189                 BOOST_FOREACH( const std::size_t dig, ambiguousPaddindDigits )
00190                 {
00191                         if( paddings.find( dig ) == paddings.end() )
00192                         {
00193                                 // if one digits from ambiguous digits doesn't correspond to
00194                                 // a padding... we keep the whole sequence without padding.
00195                                 onlyConsiderPadding = true;
00196                                 break;
00197                         }
00198                 }
00199         }
00200 
00201         if( onlyConsiderPadding )
00202         {
00203                 //std::cout << "Detector onlyConsiderPadding: " << __LINE__ << std::endl;
00204                 // sort by padding
00205                 std::sort( numberPartsBegin, numberPartsEnd, FileNumbers::SortByPadding() );
00206                 // split when the padding changed
00207                 std::vector<FileNumbers>::const_iterator start = numberPartsBegin;
00208                 for( std::vector<FileNumbers>::const_iterator it = boost::next(start); it != numberPartsEnd; ++it )
00209                 {
00210                         if( start->getPadding(index) != it->getPadding(index) )
00211                         {
00212                                 const std::size_t p = start->getPadding(index);
00213                                 result.push_back( privateBuildSequence( defaultSeq, stringParts, start, it, index, p, (p!=0) ) );
00214                                 start = it;
00215                         }
00216                 }
00217                 const std::size_t p = start->getPadding(index);
00218                 result.push_back( privateBuildSequence( defaultSeq, stringParts, start, numberPartsEnd, index, p, (p!=0) ) );
00219                 return;
00220         }
00221         else
00222         {
00223                 //std::cout << "Detector onlyConsiderDigits: " << __LINE__ << std::endl;
00224                 // sort by digits
00225                 std::sort( numberPartsBegin, numberPartsEnd, FileNumbers::SortByDigit() );
00226                 // split when the number of digits changed
00227                 std::vector<FileNumbers>::const_iterator start = numberPartsBegin;
00228                 for( std::vector<FileNumbers>::const_iterator it = boost::next(numberPartsBegin); it != numberPartsEnd; ++it )
00229                 {
00230                         if( start->getNbDigits(index) != it->getNbDigits(index) )
00231                         {
00232                                 const std::size_t p = boost::prior(it)->getPadding(index);
00233                                 const std::size_t pStart = start->getPadding(index);
00234                                 result.push_back( privateBuildSequence( defaultSeq, stringParts, start, it, index, pStart, (p!=pStart) ) );
00235                                 start = it;
00236                         }
00237                 }
00238                 const std::size_t p = boost::prior(numberPartsEnd)->getPadding(index);
00239                 const std::size_t pStart = start->getPadding(index);
00240                 result.push_back( privateBuildSequence( defaultSeq, stringParts, start, numberPartsEnd, index, pStart, (p!=pStart) ) );
00241                 return;
00242         }
00243 }
00244 
00245 
00246 bool getVaryingNumber( std::ssize_t& index, const FileNumbers& a, const FileNumbers& b )
00247 {
00248         BOOST_ASSERT( a.size() == b.size() );
00249         bool foundOne = false;
00250         for( std::size_t i = 0; i < a.size(); ++i )
00251         {
00252                 if( a.getString(i) != b.getString(i) )
00253                 {
00254                         if( foundOne )
00255                         {
00256                                 index = -1;
00257                                 return false; // more than one element founded
00258                         }
00259                         foundOne = true;
00260                         index = i;
00261                 }
00262         }
00263         if( !foundOne )
00264                 index = -1;
00265         return foundOne; // we found one varying index
00266 }
00267 
00268 std::vector<Sequence> buildSequences( const boost::filesystem::path& directory, const FileStrings& stringParts, std::vector<FileNumbers>& numberParts, const EMaskOptions& desc )
00269 {
00270         Sequence defaultSeq( directory, desc );
00271         BOOST_ASSERT( numberParts.size() > 0 );
00272         // assert all FileNumbers have the same size...
00273         BOOST_ASSERT( numberParts.front().size() == numberParts.back().size() );
00274         const std::size_t len = numberParts.front().size();
00275         std::vector<Sequence> result;
00276         
00277         if( numberParts.size() <= 1 )
00278         {
00279                 privateBuildSequencesAccordingToPadding( result, defaultSeq, stringParts, numberParts.begin(), numberParts.end(), len-1 );
00280                 return result;
00281         }
00282         
00283         // detect which part is the sequence number
00284         // for the moment, accept only one sequence
00285         // but we can easily support multi-sequences
00286         std::vector<std::size_t> allIndex; // vector of indices (with 0 < index < len) with value changes
00287         for( std::size_t i = 0; i < len; ++i )
00288         {
00289                 const std::string t = numberParts.front().getString( i );
00290 
00291                 BOOST_FOREACH( const FileNumbers& sn, numberParts )
00292                 {
00293                         if( sn.getString( i ) != t )
00294                         {
00295                                 allIndex.push_back( i );
00296                                 break;
00297                         }
00298                 }
00299         }
00300         
00301         //std::cout << "allIndex.size(): " << allIndex.size() << std::endl;
00302         
00303         if( allIndex.size() == 1 )
00304         {
00305                 // if it's a simple sequence, but may be mix multiple paddings
00306                 privateBuildSequencesAccordingToPadding( result, defaultSeq, stringParts, numberParts.begin(), numberParts.end(), allIndex.front() );
00307                 return result;
00308         }
00309         
00310         // it's a multi-sequence
00311         
00312         // ambiguous example
00313         // 1 2 3
00314         // 1 3 3
00315         // 1 4 3
00316         // 1 5 3 // could go in both sequences
00317         //// split here
00318         // 1 5 4
00319         // 1 5 5
00320         // 1 5 6
00321         // 1 5 7
00322         
00323         std::sort( numberParts.begin(), numberParts.end(), FileNumbers::SortByPadding() );
00324 
00325         std::vector<FileNumbers>::iterator start = numberParts.begin();
00326         std::vector<FileNumbers>::iterator it = boost::next(start);
00327         std::vector<FileNumbers>::iterator itEnd = numberParts.end();
00328         std::ssize_t previousIndex = -1;
00329         std::ssize_t index = -1;
00330         bool split = false;
00331         
00332         for( ; it != itEnd; ++it )
00333         {
00334                 //std::cout << "________________________________________" <<  std::endl;
00335                 //std::cout << "start: " << *start <<  std::endl;
00336                 //std::cout << "it: " << *it <<  std::endl;
00337                 if( getVaryingNumber( index, *start, *it ) )
00338                 {
00339                         if( previousIndex != -1 && // we previously have a sequence and
00340                             index != previousIndex ) // the index is not the same than previous: split!
00341                         {
00342                                 split = true;
00343                         }
00344 //                      else
00345 //                      {
00346 //                              // we don't have a sequence before, there is now one varying number,
00347 //                              // so it's the next sequence
00348 //                      }
00349                 }
00350                 else
00351                 {
00352                         // more than one varying number: split in all cases!
00353                         split = true;
00354                         // if no sequence before... the file is alone...
00355                         // Set the number as the last number.
00356                         if( previousIndex == -1 )
00357                                 previousIndex = start->size() - 1;
00358                 }
00359                 if( split )
00360                 {
00361                         privateBuildSequencesAccordingToPadding( result, defaultSeq, stringParts, start, it, previousIndex );
00362                         split = false;
00363                         index = -1;
00364                         start = it;
00365                 }
00366                 previousIndex = index;
00367         }
00368         if( previousIndex == -1 )
00369                 previousIndex = start->size() - 1;
00370         privateBuildSequencesAccordingToPadding( result, defaultSeq, stringParts, start, it, previousIndex );
00371         
00372         return result;
00373 }
00374 
00375 std::size_t decomposeFilename( const std::string& filename, FileStrings& stringParts, FileNumbers& numberParts, const EMaskOptions& options )
00376 {
00377         static const std::size_t max = std::numeric_limits<std::size_t>::digits10;
00378         std::string regex;
00379         if( options & eMaskOptionsNegativeIndexes )
00380         {
00381                 regex = "[\\+\\-]?+\\d{1," + boost::lexical_cast<std::string>( max ) + "}";
00382         }
00383         else
00384         {
00385                 regex = "\\d{1," + boost::lexical_cast<std::string>( max ) + "}";
00386         }
00387         const boost::regex re( regex );
00388         static const int subs[] = { -1, 0 }; // get before match and current match
00389         boost::sregex_token_iterator m( filename.begin(), filename.end(), re, subs );
00390         boost::sregex_token_iterator end;
00391 
00392 //      std::cout << "________________________________________" << std::endl;
00393 //      std::cout << "filename: " << filename << std::endl;
00394 //      std::cout << "regex: " << regex << std::endl;
00395         while( m != end )
00396         {
00397                 // begin with string id, can be an empty string if str begins with a number
00398 //              std::cout << "stringPart: " << *m << std::endl;
00399                 stringParts.getId().push_back( *m++ );
00400                 if( m != end ) // if end with a string and not a number
00401                 {
00402 //                      std::cout << "numberPart: " << *m << std::endl;
00403                         numberParts.push_back( *m++ );
00404                 }
00405         }
00406         if( stringParts.getId().size() == numberParts.size() )
00407         {
00408                 stringParts.getId().push_back( "" ); // we end with an empty string
00409         }
00410         //std::cout << numberParts.size() << std::endl;
00411         return numberParts.size();
00412 }
00413 
00414 }