You are not the first person to run into this problem with iconv. Someone has written a Perl script to solve it.
The iconv command-line program doesn't handle large files well, because it reads its entire input into memory before converting it. From the glibc source code, in iconv/iconv_prog.c:
/* Since we have to deal with arbitrary encodings we must read the whole text in a buffer and process it in one step. */
However, for your particular case, it might be better to write your own UTF-8 validator. You could easily distill iconv -c -f utf8 -t utf8 down to a small C program, with a loop that calls iconv(3). Since UTF-8 is modeless and self-synchronizing, you can process it in chunks.
#include <errno.h>
#include <iconv.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define BUFSIZE 4096

/*
 * Write all `len` bytes of `buf` to `fd`.
 *
 * write(2) may transfer fewer bytes than requested (pipes, signals, full
 * disks), so keep going until everything is out.  Returns 0 on success and
 * -1 on failure, with errno left as set by write().
 */
static int write_all(int fd, const char *buf, size_t len)
{
    while (len > 0) {
        ssize_t n = write(fd, buf, len);

        if (n < 0) {
            if (errno == EINTR)
                continue;
            return -1;
        }
        buf += n;
        len -= (size_t)n;
    }
    return 0;
}

/*
 * Copy STDIN to STDOUT, omitting invalid UTF-8 sequences.
 *
 * The input is processed in BUFSIZE chunks.  Each invalid byte is reported
 * on stderr with its 1-based offset in the input stream and then skipped.
 * An incomplete sequence at the end of a chunk is carried over and completed
 * by the next read; one that is still incomplete at end of input is reported
 * and dropped.
 *
 * Exit status: 0 on success, 1 on a read/write error, 2 on an iconv error.
 */
int main(void)
{
    char ib[BUFSIZE], ob[BUFSIZE];
    size_t iblen = 0;               /* unconverted bytes held at ib[0..] */
    unsigned long long total = 0;   /* stream offset of the byte at ib[0] */
    iconv_t cd = iconv_open("utf8", "utf8");

    if (cd == (iconv_t)-1) {
        perror("iconv_open");
        return 2;
    }

    for (;;) {
        /* Append after any bytes carried over from the previous chunk.  The
           carry-over is at most one partial sequence, so there is always
           room left to read into. */
        ssize_t bytes_read = read(STDIN_FILENO, ib + iblen, sizeof(ib) - iblen);

        if (bytes_read < 0) {
            if (errno == EINTR)
                continue;
            perror("read");
            return 1;
        }
        if (bytes_read == 0)
            break;
        iblen += (size_t)bytes_read;

        char *ibp = ib;
        int need_more = 0;

        /* Drain the buffer completely.  Stopping at the first invalid byte
           would leave the bytes after it unconverted, and they would be lost
           if the next read hit end of input. */
        while (iblen > 0 && !need_more) {
            char *obp = ob;
            size_t oblen = sizeof(ob);
            size_t rc = iconv(cd, &ibp, &iblen, &obp, &oblen);
            /* Save errno now: write() below may overwrite it. */
            int err = (rc == (size_t)-1) ? errno : 0;

            /* iconv() has already stored everything that precedes the point
               of failure, so emit it whatever the outcome. */
            if (write_all(STDOUT_FILENO, ob, sizeof(ob) - oblen) != 0) {
                perror("write");
                return 1;
            }

            switch (err) {
            case 0:
                break;
            case EILSEQ:
                /* Invalid input multibyte sequence: ibp points at it. */
                fprintf(stderr, "Invalid multibyte sequence at byte %llu\n",
                        total + (unsigned long long)(ibp - ib) + 1);
                ibp++;
                iblen--;    /* Skip the bad byte and keep converting */
                break;
            case EINVAL:
                /* Incomplete input multibyte sequence: wait for more data. */
                need_more = 1;
                break;
            case E2BIG:
                /* Output buffer was full; it has been flushed, so retry. */
                break;
            default:
                errno = err;
                perror("iconv");
                return 2;
            }
        }

        /* ibp - ib bytes have left the buffer; the remaining iblen bytes are
           part of an incomplete UTF-8 sequence.  Move them to the beginning
           of ib so the next read can complete them. */
        total += (unsigned long long)(ibp - ib);
        memmove(ib, ibp, iblen);
    }

    if (iblen > 0) {
        /* End of input arrived in the middle of a sequence. */
        fprintf(stderr, "Incomplete multibyte sequence at byte %llu\n",
                total + 1);
    }

    return iconv_close(cd);
}