Finding the longest substring which occurs twice in a given string

I’m reading Jon Bentley’s Programming Pearls and one of the interesting exercises was to find the longest substring which occurs twice in a given string of length n.

There’s a naïve solution where you look at every pair of (distinct) indexes (i, j), and calculate the length of the common prefix of the substrings starting at those locations; this is O(n2) assuming that the length of the eventual maximum substring is O(1) (that is, << n.)

Jon shows that there is an O(n log n) algorithm, which is a significant savings if n is large (e.g., War and Peace.)  This involves building an array of all substrings, then sorting them lexically, then walking the sorted array; the trick is that now we only need to check pairs of indexes that correspond to adjacent entries in the array.  That step can be done in O(n) time; the O(n log n) comes from the sorting step.

I wrote up a quick app that implements his suggested algorithm.  Source follows.

// main.cpp

#include <windows.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <search.h>

#define LOG(format, ...) wprintf(format L"n", __VA_ARGS__)

template<typename T>
class DeleteArrayOnExit {
public:
    DeleteArrayOnExit(T *p) : m_p(p), m_canceled(false) {}
    ~DeleteArrayOnExit() { if (!m_canceled) { delete [] m_p; } }
    void Cancel() { m_canceled = true; }
    void Swap(T *p) { m_p = p; }
private:
    T *m_p;
    bool m_canceled;
};

// caller must delete [] the buffer when done with it
LPWSTR ReadFromStdIn();
int __cdecl pwcscmp(const void *pstra, const void *pstrb) {
    return wcscmp(*(LPWSTR*)pstra, *(LPWSTR*)pstrb);
}

// returns length of longest common substring
// don't include the null terminator if both strings are identical
// e.g., comlen(banana, bananas) == comlen(banana, banana) == 6
int comlen(LPCWSTR a, LPCWSTR b) {
    int i = 0;

    // keep going while the strings are the same and not ended
    while (a[i] && (a[i] == b[i])) { i++; }

    return i;
}

int _cdecl wmain() {
    LPWSTR text = ReadFromStdIn();
    if (nullptr == text) {
        return -__LINE__;
    }
    DeleteArrayOnExit<WCHAR> deleteText(text);
    
    size_t size = wcslen(text) + 1;
    LPWSTR *suffixes = new LPWSTR[size];
    if (nullptr == suffixes) {
        LOG(L"Could not allocate memory for suffixes");
        return -__LINE__;
    }
    DeleteArrayOnExit<LPWSTR> deleteSuffixes(suffixes);
    
    for (size_t i = 0; i < size; i++) {
        suffixes[i] = &text[i];
    }
    
    qsort(suffixes, size, sizeof(LPWSTR), pwcscmp);
    
    // find the longest common adjacent pair
    LPWSTR szMax = suffixes[0];
    size_t lenMax = 0;
    for (size_t i = 0; i < size - 1; i++) {
        size_t len = comlen(suffixes[i], suffixes[i + 1]);
        if (len > lenMax) {
            lenMax = len;
            szMax = suffixes[i];
        }
    }
    
    WCHAR *substring = new WCHAR[lenMax + 1];
    if (nullptr == substring) {
        LOG(L"Could not allocate memory for substring");
        return -__LINE__;
    }
    DeleteArrayOnExit<WCHAR> deleteSubstring(substring);
    if (0 != wcsncpy_s(substring, lenMax + 1, szMax, lenMax)) {
        LOG(L"wcsncpy_s failed");
        return -__LINE__;
    }
    substring[lenMax] = L'';
    
    // intentionally not using LOG to avoid trailing newline
    wprintf(L"%s", substring);
    
    return 0;
}

LPWSTR ReadFromStdIn() {
    WCHAR *text = new WCHAR[1];
    if (nullptr == text) {
        LOG(L"Could not allocate memory for text");
        return nullptr;
    }
    DeleteArrayOnExit<WCHAR> deleteText(text);
    text[0] = L'';

    // read a 2 KB chunk
    const size_t buffer_size = 1024;
    WCHAR *buffer = new WCHAR[buffer_size];
    if (nullptr == buffer) {
        LOG(L"Could not allocate memory for buffer");
        return nullptr;
    }
    DeleteArrayOnExit<WCHAR> deleteBuffer(buffer);
    
    bool done = false;
    do {
        if (fgetws(buffer, buffer_size, stdin)) {
            size_t size = wcslen(text) + wcslen(buffer) + 1;
            WCHAR *new_text = new WCHAR[size];
            if (nullptr == new_text) {
                LOG(L"Could not allocate memory for new text");
                return nullptr;
            }
            DeleteArrayOnExit<WCHAR> deleteNewText(new_text);
            
            WCHAR *dest = new_text;
            
            if (0 != wcscpy_s(dest, size, text)) {
                LOG(L"wcscpy_s failed");
                return nullptr;
            }
            dest += wcslen(text);
            size -= wcslen(text);
            
            if (0 != wcscpy_s(dest, size, buffer)) {
                LOG(L"wcscpy_s failed");
                return nullptr;
            }

            // that should do it for copying
            // now swap new_text => text
            
            delete [] text;
            text = new_text;

            deleteText.Swap(new_text);
            deleteNewText.Cancel();
            
        } else if (feof(stdin)) {
            done = true;
        } else {
            LOG(L"Error reading from STDIN");
            return nullptr;
        }
    } while (!done);
    
    deleteText.Cancel();
    return text;
}

2 thoughts on “Finding the longest substring which occurs twice in a given string

Leave a comment