I’m reading Jon Bentley’s Programming Pearls and one of the interesting exercises was to find the longest substring which occurs twice in a given string of length n.
There’s a naïve solution where you look at every pair of (distinct) indexes (i, j), and calculate the length of the common prefix of the substrings starting at those locations; this is O(n2) assuming that the length of the eventual maximum substring is O(1) (that is, << n.)
Jon shows that there is an O(n log n) algorithm, which is a significant savings if n is large (e.g., War and Peace.) This involves building an array of all substrings, then sorting them lexically, then walking the sorted array; the trick is that now we only need to check pairs of indexes that correspond to adjacent entries in the array. That step can be done in O(n) time; the O(n log n) comes from the sorting step.
I wrote up a quick app that implements his suggested algorithm. Source follows.
// main.cpp #include <windows.h> #include <stdio.h> #include <string.h> #include <stdlib.h> #include <search.h> #define LOG(format, ...) wprintf(format L"n", __VA_ARGS__) template<typename T> class DeleteArrayOnExit { public: DeleteArrayOnExit(T *p) : m_p(p), m_canceled(false) {} ~DeleteArrayOnExit() { if (!m_canceled) { delete [] m_p; } } void Cancel() { m_canceled = true; } void Swap(T *p) { m_p = p; } private: T *m_p; bool m_canceled; }; // caller must delete [] the buffer when done with it LPWSTR ReadFromStdIn(); int __cdecl pwcscmp(const void *pstra, const void *pstrb) { return wcscmp(*(LPWSTR*)pstra, *(LPWSTR*)pstrb); } // returns length of longest common substring // don't include the null terminator if both strings are identical // e.g., comlen(banana, bananas) == comlen(banana, banana) == 6 int comlen(LPCWSTR a, LPCWSTR b) { int i = 0; // keep going while the strings are the same and not ended while (a[i] && (a[i] == b[i])) { i++; } return i; } int _cdecl wmain() { LPWSTR text = ReadFromStdIn(); if (nullptr == text) { return -__LINE__; } DeleteArrayOnExit<WCHAR> deleteText(text); size_t size = wcslen(text) + 1; LPWSTR *suffixes = new LPWSTR[size]; if (nullptr == suffixes) { LOG(L"Could not allocate memory for suffixes"); return -__LINE__; } DeleteArrayOnExit<LPWSTR> deleteSuffixes(suffixes); for (size_t i = 0; i < size; i++) { suffixes[i] = &text[i]; } qsort(suffixes, size, sizeof(LPWSTR), pwcscmp); // find the longest common adjacent pair LPWSTR szMax = suffixes[0]; size_t lenMax = 0; for (size_t i = 0; i < size - 1; i++) { size_t len = comlen(suffixes[i], suffixes[i + 1]); if (len > lenMax) { lenMax = len; szMax = suffixes[i]; } } WCHAR *substring = new WCHAR[lenMax + 1]; if (nullptr == substring) { LOG(L"Could not allocate memory for substring"); return -__LINE__; } DeleteArrayOnExit<WCHAR> deleteSubstring(substring); if (0 != wcsncpy_s(substring, lenMax + 1, szMax, lenMax)) { LOG(L"wcsncpy_s failed"); return -__LINE__; } substring[lenMax] = L''; // intentionally not using LOG to avoid trailing newline wprintf(L"%s", substring); return 0; } LPWSTR ReadFromStdIn() { WCHAR *text = new WCHAR[1]; if (nullptr == text) { LOG(L"Could not allocate memory for text"); return nullptr; } DeleteArrayOnExit<WCHAR> deleteText(text); text[0] = L''; // read a 2 KB chunk const size_t buffer_size = 1024; WCHAR *buffer = new WCHAR[buffer_size]; if (nullptr == buffer) { LOG(L"Could not allocate memory for buffer"); return nullptr; } DeleteArrayOnExit<WCHAR> deleteBuffer(buffer); bool done = false; do { if (fgetws(buffer, buffer_size, stdin)) { size_t size = wcslen(text) + wcslen(buffer) + 1; WCHAR *new_text = new WCHAR[size]; if (nullptr == new_text) { LOG(L"Could not allocate memory for new text"); return nullptr; } DeleteArrayOnExit<WCHAR> deleteNewText(new_text); WCHAR *dest = new_text; if (0 != wcscpy_s(dest, size, text)) { LOG(L"wcscpy_s failed"); return nullptr; } dest += wcslen(text); size -= wcslen(text); if (0 != wcscpy_s(dest, size, buffer)) { LOG(L"wcscpy_s failed"); return nullptr; } // that should do it for copying // now swap new_text => text delete [] text; text = new_text; deleteText.Swap(new_text); deleteNewText.Cancel(); } else if (feof(stdin)) { done = true; } else { LOG(L"Error reading from STDIN"); return nullptr; } } while (!done); deleteText.Cancel(); return text; }
2 thoughts on “Finding the longest substring which occurs twice in a given string”