cURL / Mailing Lists / curl-library / Single Mail

curl-library

getinmemory.c problems

From: Derek Martin <code_at_pizzashack.org>
Date: Thu, 27 Dec 2012 19:36:00 -0600

Hi,

I wrote a small program to fetch an object into memory, similar to
what getinmemory.c does. The reason I wrote my own is that
getinmemory.c has two problems:

1. It calls realloc() every time through the callback. This is
   terribly inefficient, though it appears to be somewhat required by
   libcurl, since (as I found out in my testing) it does not
   necessarily make the content-length available to you until after
   perform() returns.

   The downside of calling realloc() repeatedly is that it
   contributes to memory fragmentation and can cause your data to be
   copied around multiple times. Neither is especially conducive to
   good performance. Since you SHOULD be able to get the content
   length (from the Content-Length header) before you start reading
   the body, it should be possible to avoid doing a realloc() EVER.
   In practice, libcurl sometimes does, and sometimes does not make
   the content length available prior to reading the body via
   curl_easy_getinfo(). You either get the actual value, or zero.

2. If you point getinmemory.c at, say, www.google.com (or several
   other sites I tried, including one of my own), you get no results.
   It just prints "0 bytes retrieved" and dies a happy death.
   Clearly, this is the Wrong Thing™. Though to be honest, it's not
   clear to me why it does that. It does work with most sites I
   tried... including others of my own.

I didn't discover problem #2 until after I already attempted to write
a version that solves #1. My version partially solves #1: when you
actually can get the content length, it does only one malloc(). It
also -- again for reasons I cannot explain -- does actually manage to
get results from www.google.com.

It would be really swell if libcurl actually guaranteed the
content-length header's value will be available before perform()
completes. The code required to work around that problem while still
optimizing for the cases it's available is pretty ugly.

And here's the actual code:

/***************************************************************************
 *
 * Copyright © 2012 Derek D. Martin
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * This version of getinmemory.c is provided to attempt to correct a
 * deficiency in the way memory is allocated in the example of the
 * same name provided by Daniel Stenberg at this URL:
 * http://curl.haxx.se/libcurl/c/getinmemory.html
 *
 ***************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>

#include <curl/curl.h>

/*
 * Download state shared between main() and the libcurl write callback.
 * main() zero-initializes it and hands its address to libcurl as the
 * CURLOPT_WRITEDATA pointer; the callback fills it in as data arrives.
 *
 * NOTE(review): identifiers beginning with an underscore are reserved at
 * file scope in C, so the tag name would ideally be renamed.
 */
typedef struct _curl_obj {
    CURL *handle;           /* easy handle; the callback queries it for the content length */
    char *buf;              /* heap buffer holding the downloaded bytes; NULL until the first callback */
    long long size;         /* bytes the buffer is sized for (it is allocated with size + 1) */
    long long write_pos;    /* offset in buf where the next chunk gets copied */
    long long content_len;  /* content length obtained from libcurl; 0 means not known yet */
} CurlObj;

/*
 * Fatal-error hook for the content-length conversion below: report the
 * overflow on stderr and terminate the program with status 1.
 */
int error(void)
{
    fprintf(stderr, "Converting content length to an integer overflowed\n");
    exit(1);
    /* we never get here, but satisfy gcc */
    return 1;
}

/*
 * Round a double to the nearest long (halves away from zero).
 *
 * Calls error(), which exits, when the rounded value does not fit in a
 * long or when the input is NaN.
 */
static long float_to_long_checked(double value)
{
    /*
     * -(double)LONG_MIN is a power of two and therefore exact, whereas
     * (double)LONG_MAX may round UP to that same power of two and let an
     * out-of-range value slip through the check.
     */
    const double limit = -(double)LONG_MIN;
    double rounded = (value >= 0) ? value + 0.5 : value - 0.5;

    /* written as a negated range test so that NaN is rejected as well */
    if (!(rounded > -limit - 1.0 && rounded < limit))
        return error();
    return (long)rounded;
}

/*
 * libcurl hands the content length back as a double; this converts it to
 * a long. Implemented through a function so that the argument is
 * evaluated exactly once.
 */
#define float_to_long(x) (float_to_long_checked(x))

/* libcurl calls this a write callback, but seems like it's for reading to me */
size_t read_from_curl(void *buf, size_t size, size_t nmemb, void *userp)
{
    size_t len;
    long long tmplen;
    long status;
    CurlObj *co;
    double content_len;

    /* how many bytes copied by this callback call */
    len = size * nmemb;

    /* allocate the buffer if we need to */
    co = (CurlObj *)userp;
    if (co->buf == NULL){
        /* Here the error() is slightly bogus, but gets the point across */
        if ( (curl_easy_getinfo(co->handle, CURLINFO_CONTENT_LENGTH_DOWNLOAD,
                &content_len)) != CURLE_OK) error();
        co->size = float_to_long(content_len);
        /*
         * Sometimes we don't get content length right away. In that case, we
         * do need to call realloc... So we store what we got.
         */
        printf("Content-Length: %lld\n", co->size);
        co->content_len = co->size;
        if (co->size < size) co->size = len;
        printf("allocating %lld bytes for buffer\n", co->size + 1);
        co->buf = (char *)malloc(co->size + 1);
        co->buf[co->size] = '\0';
    }

    /* deal with the case we didn't get content length originally */
    if (!co->content_len){
        if ( (curl_easy_getinfo(co->handle, CURLINFO_CONTENT_LENGTH_DOWNLOAD,
                &content_len)) != CURLE_OK) error();
        tmplen = float_to_long(content_len);
        if (tmplen){
            printf("Content-Length: %lld\n", co->size);
            printf("re-allocating %llu bytes for buffer\n", tmplen + 1);
            co->buf = (char *)realloc(co->buf, tmplen + 1);
            co->buf[tmplen] = '\0';
            co->content_len = tmplen;
            co->size = tmplen;
        } else {
            printf("Booo! content-length is still 0\n");
            co->size = co->size + len;
            printf("re-allocating %llu bytes for buffer\n", co->size + 1);
            co->buf = (char *)realloc(co->buf, co->size + 1);
        }
    }
    /* copy them into our read buffer */
    memcpy(co->buf + co->write_pos, buf, len);
    co->write_pos += len;
    return len;
}

int main(int argc, char **argv)
{
    CURL *curl_handle;
    CurlObj o;
    int res;

    /* Initialize CURL */
    curl_global_init(CURL_GLOBAL_ALL);
    curl_handle = curl_easy_init();

    /* Now initialize our file object */
    o.buf = NULL; /* allocate this once we know the content-length */
    o.size = 0;
    o.handle = curl_handle;
    o.write_pos = 0;
    o.content_len = 0;

    /* argv[1] = URL. You should make sure it exists but this is an example. */
    curl_easy_setopt(curl_handle, CURLOPT_URL, argv[1]);
    curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, read_from_curl);
    curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&o);
    curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "pizzacode/1.0");
    /* fetch the URL - we should check the return value but whatever... */
    if ((res = curl_easy_perform(curl_handle) != CURLE_OK)) curl_easy_strerror(res);
    curl_easy_cleanup(curl_handle);
    printf("%s\n", o.buf);
    /* free(NULL) is a no-op, pointless to check for it */
    free(o.buf);
    /* BUT, set it to NULL after you free() so it isn't freed twice */
    o.buf = NULL;
    curl_global_cleanup();
    return 0;
}

-------------------------------------------------------------------
List admin: http://cool.haxx.se/list/listinfo/curl-library
Etiquette: http://curl.haxx.se/mail/etiquette.html

  • application/pgp-signature attachment: stored
Received on 2012-12-28