OPeNDAP Hyrax Back End Server (BES)  Updated for version 3.8.3
RemoteHttpResource.cc
Go to the documentation of this file.
1 // -*- mode: c++; c-basic-offset:4 -*-
2 
3 // This file is part of gateway_module, A C++ module that can be loaded in to
4 // the OPeNDAP Back-End Server (BES) and is able to handle remote requests.
5 
6 // Copyright (c) 2013 OPeNDAP, Inc.
7 // Author: Nathan Potter <ndp@opendap.org>
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 //
23 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
24 
25 // (c) COPYRIGHT URI/MIT 1994-1999
26 // Please read the full copyright statement in the file COPYRIGHT_URI.
27 //
28 // Authors:
29 // ndp Nathan Potter <ndp@opendap.org>
30 
31 #include <unistd.h>
32 
33 #include <sstream>
34 #include <GNURegex.h>
35 
36 #include "util.h"
37 #include "debug.h"
38 #include "Error.h"
39 
40 #include "BESSyntaxUserError.h"
41 #include "BESInternalError.h"
42 #include "BESError.h"
43 
44 #include "BESRegex.h"
45 #include "TheBESKeys.h"
46 
47 #include "GatewayUtils.h"
48 #include "curl_utils.h"
49 #include "RemoteHttpResource.h"
50 
51 
52 using namespace std;
53 
54 
55 
56 namespace gateway {
57 
58 
64 RemoteHttpResource::RemoteHttpResource(const string &url)
65 {
66  _initialized = false;
67 
68  d_fd = 0;
69  d_curl = 0;
70  d_resourceCacheFileName.clear();
71  d_response_headers = new vector<string>();
72  d_request_headers = new vector<string>();
73 
74  if( url.empty() )
75  {
76  string err = "RemoteHttpResource(): Remote resource URL is empty" ;
77  throw BESInternalError( err, __FILE__, __LINE__ ) ;
78  }
79  d_remoteResourceUrl = url;
80  BESDEBUG("gateway", "RemoteHttpResource() - URL: " << d_remoteResourceUrl << endl);
81 
82  /*
83  *
84  *
85  * EXAMPLE: returned value parameter for CURL *
86  *
87  CURL *www_lib_init(CURL **curl); // function type signature
88 
89 
90  CURL *pvparam = 0; // passed value parameter
91  result = www_lib_init(&pvparam); // the call to the method
92  */
93 
94  d_curl = libcurl::init(d_error_buffer); // This may throw either Error or InternalErr
95 
96  libcurl::configureProxy(d_curl, d_remoteResourceUrl); // Configure the a proxy for this url (if appropriate).
97 
98  BESDEBUG("gateway", "RemoteHttpResource() - d_curl: " << d_curl << endl);
99 
100 }
101 
102 
103 
104 RemoteHttpResource::~RemoteHttpResource() {
109 
110  BESDEBUG("gateway", "~RemoteHttpResource() - BEGIN resourceURL: " << d_remoteResourceUrl << endl);
111 
112  delete d_response_headers;
113  d_response_headers = 0;
114  BESDEBUG("gateway", "~RemoteHttpResource() - Deleted d_response_headers." << endl);
115 
116  delete d_request_headers;
117  d_request_headers = 0;
118  BESDEBUG("gateway", "~RemoteHttpResource() - Deleted d_request_headers." << endl);
119 
120  if(!d_resourceCacheFileName.empty()){
121  BESCache3::get_instance()->unlock_and_close(d_resourceCacheFileName);
122  BESDEBUG("gateway", "~RemoteHttpResource() - Closed and unlocked "<< d_resourceCacheFileName << endl);
123  d_resourceCacheFileName.clear();
124  }
125 
126 
127  if(d_curl){
128  curl_easy_cleanup(d_curl);
129  BESDEBUG("gateway", "~RemoteHttpResource() - Called curl_easy_cleanup()." << endl);
130  }
131  d_curl = 0;
132 
133 
134  BESDEBUG("gateway", "~RemoteHttpResource() - END resourceURL: " << d_remoteResourceUrl << endl);
135  d_remoteResourceUrl.clear();
136 
137 }
138 
139 
147 void RemoteHttpResource::retrieveResource()
148 {
149  BESDEBUG("gateway", "RemoteHttpResource::retrieveResource() - BEGIN resourceURL: " << d_remoteResourceUrl << endl);
150 
151 
152  if(_initialized){
153  BESDEBUG("gateway", "RemoteHttpResource::retrieveResource() - END Already initialized." << endl);
154  return;
155  }
156 
157  // Get a pointer to the singleton cache instance for this process.
158  BESCache3 *cache = BESCache3::get_instance(TheBESKeys::TheKeys(), (string)"BES.CacheDir",
159  (string)"BES.CachePrefix", (string)"BES.CacheSize");
160 
161  // Get the name of the file in the cache (either the code finds this file or
162  // or it makes it).
163  // @TODO Fix BESCache3 so that you can ask it to NOT strip the suffix which is some kind of funky
164  // Uncompress feature that should be controllable and probably driven by a regex. Then we can stop
165  // adding our own suffix so that BESCache3 can remove it.
166  d_resourceCacheFileName = cache->get_cache_file_name(d_remoteResourceUrl+".uglyHack");
167  BESDEBUG("gateway", "RemoteHttpResource::retrieveResource() - d_resourceCacheFileName: " << d_resourceCacheFileName << endl);
168 
169  // @TODO MAKE THIS RETRIEVE THE CACHED DATA TYPE IF THE CACHED RESPONSE IF FOUND
170  // We need to know the type of the resource. HTTP headers are the preferred way to determine the type.
171  // Unfortunately, the current code losses both the HTTP headers sent from the request and the derived type
172  // to subsequent accesses of the cached object. Since we have to have a type, for now we just set the type
173  // from the url. If down below we DO an HTTP GET then the headers will be evaluated and the type set by setType()
174  // But really - we gotta fix this.
175  GatewayUtils::Get_type_from_url( d_remoteResourceUrl, d_type );
176  BESDEBUG("gateway", "RemoteHttpResource::retrieveResource() - d_type: " << d_type << endl);
177 
178  try {
179 
180  if (cache->get_read_lock(d_resourceCacheFileName, d_fd)) {
181  BESDEBUG( "gateway", "RemoteHttpResource::retrieveResource() - Remote resource is already in cache. cache_file_name: "
182  << d_resourceCacheFileName << endl );
183  _initialized = true;
184  return;
185  }
186 
187  // Now we actually need to reach out across the interwebs and retrieve the remote resource and put it's
188  // content into a local cache file, given that it's not in the cache.
189  // First make an empty file and get an exclusive lock on it.
190  if (cache->create_and_lock(d_resourceCacheFileName, d_fd)) {
191 
192  // Write the remote resource to the cache file.
193  writeResourceToFile(d_fd);
194 
195 
196 
197 
198 
199  // #########################################################################################################
200  // I think right here is where I would be able to cache the data type/response headers. While I have
201  // the exclusive lock I could open another cache file for metadata and write to it.
202  // #########################################################################################################
203 
204  // Change the exclusive lock on the new file to a shared lock. This keeps
205  // other processes from purging the new file and ensures that the reading
206  // process can use it.
207  cache->exclusive_to_shared_lock(d_fd);
208  BESDEBUG( "gateway", "RemoteHttpResource::retrieveResource() - Converted exclusive cache lock to shared lock." << endl );
209 
210 
211  // Now update the total cache size info and purge if needed. The new file's
212  // name is passed into the purge method because this process cannot detect its
213  // own lock on the file.
214  unsigned long long size = cache->update_cache_info(d_resourceCacheFileName);
215  BESDEBUG( "gateway", "RemoteHttpResource::retrieveResource() - Updated cache info" << endl );
216 
217  if (cache->cache_too_big(size)){
218  cache->update_and_purge(d_resourceCacheFileName);
219  BESDEBUG( "gateway", "RemoteHttpResource::retrieveResource() - Updated and purged cache." << endl );
220  }
221 
222  BESDEBUG( "gateway", "RemoteHttpResource::retrieveResource() - END" << endl );
223 
224  _initialized = true;
225 
226  return;
227  }
228  else {
229  if (cache->get_read_lock(d_resourceCacheFileName, d_fd)) {
230  BESDEBUG( "gateway", "RemoteHttpResource::retrieveResource() - Remote resource is in cache. cache_file_name: "
231  << d_resourceCacheFileName << endl );
232  _initialized = true;
233  return;
234  }
235  }
236 
237  string msg = "RemoteHttpResource::retrieveResource() - Failed to acquire cache read lock for remote resource: '";
238  msg += d_remoteResourceUrl + "\n";
239  throw libdap::Error(msg);
240 
241 
242  }
243  catch (...) {
244  BESDEBUG( "gateway", "RemoteHttpResource::retrieveResource() - Caught exception, unlocking cache and re-throw." << endl );
245  cache->unlock_cache();
246  throw;
247  }
248 
249 }
250 
251 
252 
261 void RemoteHttpResource::writeResourceToFile(int fd) {
262  BESDEBUG( "gateway", "RemoteHttpResource::writeResourceToFile() - BEGIN" << endl );
263 
264 
265  int status = -1;
266  try {
267  BESDEBUG( "gateway", "RemoteHttpResource::writeResourceToFile() - Saving resource " << d_remoteResourceUrl << " to cache file " << d_resourceCacheFileName << endl );
268  status = libcurl::read_url(d_curl, d_remoteResourceUrl, fd, d_response_headers, d_request_headers, d_error_buffer); // Throws Error.
269  if (status >= 400) {
270  BESDEBUG( "gateway", "RemoteHttpResource::writeResourceToFile() - HTTP returned an error status: " << status << endl );
271  // delete resp_hdrs; resp_hdrs = 0;
272  string msg = "Error while reading the URL: '";
273  msg += d_remoteResourceUrl;
274  msg += "'The HTTP request returned a status of " + libdap::long_to_string(status) + " which means '";
275  msg += libcurl::http_status_to_string(status) + "' \n";
276  throw libdap::Error(msg);
277  }
278  BESDEBUG( "gateway", "RemoteHttpResource::writeResourceToFile() - Resource " << d_remoteResourceUrl << " saved to cache file " << d_resourceCacheFileName << endl );
279 
280  // rewind the file
281  lseek(fd,0,SEEK_SET);
282  BESDEBUG( "gateway", "RemoteHttpResource::writeResourceToFile() - Reset file descriptor." << endl );
283 
284  // @TODO CACHE THE DATA TYPE OR THE HTTP HEADERS SO WHEN WE ARE RETRIEVING THE CACHED OBJECT WE CAN GET THE CORRECT TYPE
285  setType(d_response_headers);
286  }
287  catch (libdap::Error &e) {
288  throw;
289  }
290  BESDEBUG( "gateway", "RemoteHttpResource::writeResourceToFile() - END" << endl );
291 }
292 
293 
294 
295 void RemoteHttpResource::setType(const vector<string> *resp_hdrs)
296 {
297 
298  BESDEBUG("gateway", "RemoteHttpResource::setType() - BEGIN" << endl);
299 
300  string type = "";
301 
302  // Try and figure out the file type first from the
303  // Content-Disposition in the http header response.
304  string disp ;
305  string ctype ;
306 
307 
308  if( resp_hdrs )
309  {
310  vector<string>::const_iterator i = resp_hdrs->begin() ;
311  vector<string>::const_iterator e = resp_hdrs->end() ;
312  for( ; i != e; i++ )
313  {
314  string hdr_line = (*i) ;
315 
316  BESDEBUG("gateway", "RemoteHttpResource::setType() - Evaluating header: " << hdr_line << endl);
317 
318  hdr_line = BESUtil::lowercase( hdr_line ) ;
319 
320  string colon_space = ": ";
321  int index = hdr_line.find(colon_space);
322  string hdr_name = hdr_line.substr(0,index);
323  string hdr_value = hdr_line.substr(index + colon_space.length());
324 
325  BESDEBUG("gateway", "RemoteHttpResource::setType() - hdr_name: '" << hdr_name << "' hdr_value: '" <<hdr_value << "' "<< endl);
326 
327  if( hdr_name.find( "content-disposition" ) != string::npos )
328  {
329  // Content disposition exists
330  BESDEBUG("gateway", "RemoteHttpResource::setType() - Located content-disposition header." << endl);
331  disp = hdr_value ;
332  }
333  if( hdr_name.find( "content-type" ) != string::npos )
334  {
335  BESDEBUG("gateway", "RemoteHttpResource::setType() - Located content-type header." << endl);
336  ctype = hdr_value;
337  }
338  }
339  }
340 
341  if( !disp.empty() )
342  {
343  // Content disposition exists, grab the filename
344  // attribute
346  BESDEBUG( "gateway", "RemoteHttpResource::setType() - Evaluated content-disposition '" << disp
347  << "' matched type: \"" << type
348  << "\"" << endl ) ;
349  }
350 
351  // still haven't figured out the type. Check the content-type
352  // next, translate to the BES module name. It's also possible
353  // that even though Content-disposition was available, we could
354  // not determine the type of the file.
355  if( type.empty() && !ctype.empty() )
356  {
358  BESDEBUG( "gateway", "RemoteHttpResource::setType() - Evaluated content-type '" << ctype << "' matched type \"" << type << "\"" << endl ) ;
359  }
360 
361  // still haven't figured out the type. Now check the actual URL
362  // and see if we can't match the URL to a module name
363  if( type.empty() )
364  {
365  GatewayUtils::Get_type_from_url( d_remoteResourceUrl, type ) ;
366  BESDEBUG( "gateway", "RemoteHttpResource::setType() - Evaluated url '" << d_remoteResourceUrl
367  << "' matched type: \"" << type
368  << "\"" << endl ) ;
369  }
370 
371  // still couldn't figure it out, punt
372  if( type.empty() )
373  {
374  string err = (string)"RemoteHttpResource::setType() - Unable to determine the type of data"
375  + " returned from '" + d_remoteResourceUrl +"' Setting type to 'unknown'" ;
376  BESDEBUG("gateway", err);
377 
378  type = "unknown";
379  //throw BESSyntaxUserError( err, __FILE__, __LINE__ ) ;
380  }
381 
382 
383 
384  // @TODO CACHE THE DATA TYPE OR THE HTTP HEADERS SO WHEN WE ARE RETRIEVING THE CACHED OBJECT WE CAN GET THE CORRECT TYPE
385 
386 
387  d_type = type;
388 
389  BESDEBUG("gateway", "RemoteHttpResource::setType() - END" << endl);
390 
391 }
392 
393 
394 
395 
396 
397 
398 
399 } /* namespace gateway */
virtual void unlock_cache()
Unlock the cache info file.
bool configureProxy(CURL *curl, const string &url)
Configure the proxy options for the passed curl object.
Definition: curl_utils.cc:304
exception thrown if internal error encountered
virtual bool create_and_lock(const string &target, int &fd)
Create a file in the cache and lock it for write access.
static string lowercase(const string &s)
Convert a string to all lower case.
Definition: BESUtil.cc:182
STL namespace.
static void Get_type_from_content_type(const string &ctype, string &type)
string http_status_to_string(int status)
This function translates an HTTP status code into an error messages.
Definition: curl_utils.cc:84
static void Get_type_from_disposition(const string &disp, string &type)
Implementation of a caching mechanism for compressed data.
Definition: BESCache3.h:60
long read_url(CURL *curl, const string &url, int fd, vector< string > *resp_hdrs, const vector< string > *request_headers, char error_buffer[])
Use libcurl to dereference a URL.
Definition: curl_utils.cc:527
CURL * init(char *error_buffer)
Gets a new instance of CURL* and performs basic configuration of that instance.
Definition: curl_utils.cc:426
virtual bool cache_too_big(unsigned long long current_size) const
look at the cache size; is it too large? Look at the cache size and see if it is too big...
virtual string get_cache_file_name(const string &src, bool mangle=true)
Build the name of the file that will hold the uncompressed data from 'src' in the cache.
virtual bool get_read_lock(const string &target, int &fd)
Get a read-only lock on the file if it exists.
virtual void update_and_purge(const string &new_file)
Purge files from the cache.
virtual unsigned long long update_cache_info(const string &target)
Update the cache info file to include 'target'.
static void Get_type_from_url(const string &url, string &type)
virtual void exclusive_to_shared_lock(int fd)
Transfer from an exclusive lock to a shared lock.
#define BESDEBUG(x, y)
macro used to send debug information to the debug stream
Definition: BESDebug.h:64
static BESKeys * TheKeys()
Definition: TheBESKeys.cc:48
virtual void unlock_and_close(const string &target)
Unlock the named file.
static BESCache3 * get_instance()
Get an instance of the BESCache3 object.
Definition: BESCache3.cc:83