/*
 * nasd_obj.c
 *
 * Basic object-management for on-disk fs
 *
 * Author: Jim Zelenka
 */
/*
 * Copyright (c) of Carnegie Mellon University, 1997,1998,1999.
 *
 * Permission to reproduce, use, and prepare derivative works of
 * this software for internal use is granted provided the copyright
 * and "No Warranty" statements are included with all reproductions
 * and derivative works. This software may also be redistributed
 * without charge provided that the copyright and "No Warranty"
 * statements are included in all redistributions.
 *
 * NO WARRANTY. THIS SOFTWARE IS FURNISHED ON AN "AS IS" BASIS.
 * CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER
 * EXPRESSED OR IMPLIED AS TO THE MATTER INCLUDING, BUT NOT LIMITED
 * TO: WARRANTY OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY
 * OF RESULTS OR RESULTS OBTAINED FROM USE OF THIS SOFTWARE. CARNEGIE
 * MELLON UNIVERSITY DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT
 * TO FREEDOM FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 */


#include <nasd/nasd_options.h>
#include <nasd/nasd_drive_options.h>
#include <nasd/nasd_types.h>
#include <nasd/nasd_freelist.h>
#include <nasd/nasd_itypes.h>
#include <nasd/nasd_mem.h>
#include <nasd/nasd_cache.h>
#include <nasd/nasd_common.h>
#include <nasd/nasd_timer.h>
#include <nasd/nasd_security_dr.h>
#include <nasd/nasd_layout.h>
#include <nasd/nasd_udppipe.h>
#include <nasd/nasd_remote.h>

/*
 * Compile-time debug knob; 0 leaves it disabled.
 * NOTE(review): presumably gates tracing in a read path defined
 * elsewhere in this file -- confirm against its users.
 */
#define DBG_READ_SIMPLE 0

#if 0
/*
 * Garth says it's okay to skip doing this here in the
 * interest of performance. The theory is, we really have
 * that NVRAM log where we've marked this change until
 * we commit the block for real.
 *
 * Because this section is disabled, NASD_FLUSH_BLOCKS is not defined
 * by this file, so the sections of nasd_obj_create() conditional on
 * NASD_FLUSH_BLOCKS > 0 are compiled out unless something else
 * defines it.
 */
#define NASD_FLUSH_BLOCKS      1
#define NASD_FLUSH_BLOCKS_SYNC 1
#endif /* 0 */

/*
 * Take obj rwlock before obj lock
 */

/*
 * Node numbers below this value are treated as "special" and are
 * rejected by the nodenum lookup routines in this file; node numbers
 * handed out by nasd_obj_create() are offset by it.
 */
extern nasd_nodenum_t nasd_reserved_nodes;

/*
 * You've heard of zero-filling blocks, right? Well, this is the zero-fill
 * block. Don't ask.
 *
 * Also. Whatever you do, don't assign nonzero values here.
 * (It is cleared by nasd_od_obj_sysinit().)
 */
char nasd_odc_zeroblk[NASD_OD_BASIC_BLOCKSIZE];

/*
 * Byte-sink. Write only: contents are never meaningful.
 */
char nasd_odc_bitbucket[NASD_OD_BASIC_BLOCKSIZE];

/*
 * This may have any value in the range 1..NASD_DRIVE_MAX_BLOCKMAPCHUNK
 * It should *not* be patched while the drive is dispatching
 * I/Os. Someone should do some studies sometime to see how
 * diddling this affects performance.
 */
extern int nasd_od_obj_blockmapchunk;

/*
 * Forward declaration.
 * NOTE(review): the definition is not in the part of the file seen
 * during review -- presumably it appears further down.
 */
void nasd_od_commit_inflight(void *commit_rock, nasd_offset_t start,
                             nasd_len_t len);

/*
 * Copy the attributes of the object whose node block is cached in
 * ent into *attrp.
 *
 * If node_rwlock_held is zero, the block-data read lock on ent is
 * taken around the copy and dropped again before returning. A nonzero
 * value means the caller says it already holds that lock, so it is
 * left alone.
 *
 * block_size is always NASD_OD_BASIC_BLOCKSIZE and the layout hint
 * is always reported as NASDID_NULL.
 */
void
nasd_odc_get_attr_from_ent(
  nasd_odc_ent_t    *ent,
  nasd_attribute_t  *attrp,
  int                node_rwlock_held)
{
  nasd_od_node_t *node = ent->data.node;
  int take_lock = (node_rwlock_held == 0);

  if (take_lock) {
    NASD_ODC_RLOCK_BLOCK_DATA(ent);
  }

  /* values that do not come from the node */
  attrp->block_size = NASD_OD_BASIC_BLOCKSIZE;
  attrp->layout_hint.lh_nid = NASDID_NULL;

  /* space accounting and length */
  attrp->block_preallocation = node->blocks_preallocated;
  attrp->blocks_used = node->blocks_allocated;
  attrp->object_len = node->object_len;

  /* access-key version */
  attrp->av = node->akvers;

  /* drive-maintained timestamps */
  attrp->attr_modify_time = node->attr_modify_time;
  attrp->object_modify_time = node->object_modify_time;
  attrp->object_create_time = node->object_create_time;

  /* filesystem-maintained timestamps and opaque area */
  attrp->fs_attr_modify_time = node->fs_attr_modify_time;
  attrp->fs_object_modify_time = node->fs_object_modify_time;
  memcpy((char *)attrp->fs_specific, (char *)node->fs_specific,
    NASD_FS_SPECIFIC_INFO_SIZE);

  if (take_lock) {
    NASD_ODC_RUNLOCK_BLOCK_DATA(ent);
  }
}

/*
 * One-time setup for this module: clear the shared zero-fill block.
 * Always returns NASD_SUCCESS.
 */
nasd_status_t
nasd_od_obj_sysinit()
{
  nasd_status_t rc = NASD_SUCCESS;

  bzero((char *)&nasd_odc_zeroblk[0], sizeof(nasd_odc_zeroblk));

  return(rc);
}

/*
 * Call with node data write lock held
 *
 * Takes an obj in small (atomic) mode, and
 * converts it to regular mode, by allocating
 * a data block and copying bits into it.
 *
 * ne       cache entry of the object's node block; the node must
 *          have NASD_ND_ATOMIC set and object_len must not exceed
 *          NASD_ND_ATOMIC_SIZE (both asserted)
 * partnum  partition number, passed through to nasd_od_bmap()
 *
 * Returns NASD_SUCCESS, or the error from nasd_od_bmap() or
 * nasd_odc_block_get(). On either error the ATOMIC flag and the
 * first pointer word are restored, so the node is back in its
 * original state (but see the XXX about the leaked block below).
 */
nasd_status_t
nasd_obj_deatomize(
  nasd_odc_ent_t  *ne,
  int              partnum)
{
  nasd_blkrec_t blkrec;
  nasd_od_direct_ptr_t *np_ptrs;
  nasd_odc_ent_t *ent;
  nasd_od_node_t *np;
  nasd_status_t rc;
  nasd_blkno_t sv;

  np = ne->data.node;
  NASD_ASSERT(np->flags&NASD_ND_ATOMIC);
  NASD_ASSERT(np->object_len <= NASD_ND_ATOMIC_SIZE);
  /* in atomic mode the pointer area holds object data, not pointers */
  np_ptrs = (nasd_od_direct_ptr_t *)np->ptrs;
#if NASD_OD_EXT_PTR > 0 
  blkrec.flags = 0;
  blkrec.odc_entp= NULL; 
#endif /* NASD_OD_EXT_PTR > 0 */
  /*
   * Here we play a dirty trick on bmap. We have a node
   * which has nonzero bits in the pointer block, but
   * has no blocks mapped. Since we're only mapping the
   * first block, we save out the first few bits from the
   * pointer area on our stack (variable sv), and
   * zero just those bits so bmap sees a consistent universe.
   */
  sv = np_ptrs[0].blkno;
  np_ptrs[0].blkno = 0;
  np->flags &= ~NASD_ND_ATOMIC;
  /* map (allocating if needed) the single block at offset 0 */
  rc = nasd_od_bmap(ne, 0, 1, 0, 0, partnum, NASD_ODC_B_FAULT|NASD_ODC_B_ALLOC,
    &blkrec, NULL, NULL, NULL);
  if (rc) {
    /* undo the trick: node goes back to atomic mode untouched */
    nasd_od_bmap_release(&blkrec, 1, 0, 0);
    np->flags |= NASD_ND_ATOMIC;
    np_ptrs[0].blkno = sv;
    return(rc);
  }
  /*
   * Get a cache entry for the new data block. No LOAD flag is passed:
   * the contents are supplied below rather than read from disk.
   */
  rc = nasd_odc_block_get(ne, blkrec.blkno, NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK,
    &ent, ne->identifier, 0, NASD_ODC_T_DATA, NULL);
  if (rc) {
    /* XXX we leak a block here */
    nasd_od_bmap_release(&blkrec, 1, 0, 0);
    np->flags |= NASD_ND_ATOMIC;
    np_ptrs[0].blkno = sv;
    return(rc);
  }
#if 0
nasd_printf("nid 0x%" NASD_ID_FMT " offset %6lu blk %4u 0x%lx A\n", ne->identifier, (unsigned long)0UL, ent->blkno, ent);
#endif
  /* data rwlock first, then the block lock */
  NASD_ODC_WLOCK_BLOCK_DATA(ent);
  NASD_ODC_LOCK_BLOCK(ent);
  nasd_odc_wait_not_busy(ent);
  nasd_odc_dirty_ent(ent);
  /* first word of the object data was stashed in sv above */
  ent->data.blk[0] = sv;
  if (np->object_len > sizeof(nasd_blkno_t)) {
    /* move the remaining object bytes, then clear the old in-node copy */
    bcopy((char *)np->ptrs+sizeof(nasd_blkno_t),
      (char *)ent->data.buf+sizeof(nasd_blkno_t),
      np->object_len-sizeof(nasd_blkno_t));
    bzero((char *)np->ptrs+sizeof(nasd_blkno_t),
      NASD_ND_ATOMIC_SIZE-sizeof(nasd_blkno_t));
  }
  /*
   * NOTE(review): only object_len bytes of the data block are written
   * here; whether the remainder of the block is already zeroed by the
   * allocation path is not visible from this function -- confirm.
   */
  ent->data_flags &= ~(NASD_CD_INVALID|NASD_CD_NZ);

  NASD_ODC_UNLOCK_BLOCK(ent);
  NASD_ODC_WUNLOCK_BLOCK_DATA(ent);
  NASD_BROADCAST_COND(ent->cond);
  nasd_odc_block_release(ent);
  nasd_od_bmap_release(&blkrec, 1, 0, 0);
  /* already cleared before the bmap call; repeated here */
  np->flags &= ~NASD_ND_ATOMIC;
#if ((NASD_SECURE_RPCS_ENABLE > 0) & (NASD_OD_EXT_PTR > 0 ))
  bzero((char *)np_ptrs[0].digest, sizeof(nasd_digest_t));
#endif /* ((NASD_SECURE_RPCS_ENABLE > 0) & (NASD_OD_EXT_PTR > 0 )) */
  return(NASD_SUCCESS);
}

/*
 * Create a new object in partition partnum.
 *
 * partnum         partition in which to create the object
 * in_attr         initial attribute values; only the fields selected
 *                 by in_fieldmask are consulted
 * in_fieldmask    which fields of in_attr are valid. NASD_ATTR_BLOCKS_USED
 *                 is rejected with NASD_BAD_ATTR_SET and
 *                 NASD_ATTR_BLOCK_SIZE with NASD_OP_NOT_SUPPORTED.
 * out_id          receives the identifier of the new object
 * out_attr        receives the attributes of the new object
 * part_lock_held  nonzero if the caller already holds the partition
 *                 write lock; if zero it is taken and released here
 *
 * Returns NASD_SUCCESS, NASD_BAD_PARTITION, NASD_NO_MEM (no opholder),
 * NASD_NO_MORE_OBJECTS (no free node-pagetable slot found, or the
 * refcount block could not be obtained), or the error returned by
 * nasd_od_layout_get_node_block(). Failures after a slot has been
 * claimed are treated as internal inconsistencies and assert/panic.
 *
 * Outline:
 *   1. allocate the node block (and optional preallocation range)
 *   2. find a node-pagetable (NPT) block with a free slot using the
 *      refcount block, and bump its refcount
 *   3. reserve the slot in the primary and duplicate NPT blocks with
 *      the placeholder NASD_CR_BLKNO
 *   4. initialize the node block
 *   5. point both NPT slots at the node block
 *   6. take on-disk references on the allocated ranges
 */
nasd_status_t
nasd_obj_create(
  int                 partnum,
  nasd_attribute_t   *in_attr,
  nasd_fieldmask_t    in_fieldmask,
  nasd_identifier_t  *out_id,
  nasd_attribute_t   *out_attr,
  int                 part_lock_held)
{
  nasd_drive_opholder_t *cr_oh;
  nasd_status_t rc, rc2;
  nasd_identifier_t nid;
  nasd_uint64 n64;
  int i;

  if (!NASD_OD_PARTNUM_VALID(partnum))
    return(NASD_BAD_PARTITION);

  if (in_fieldmask&NASD_ATTR_BLOCKS_USED)
    return(NASD_BAD_ATTR_SET);

  if (in_fieldmask&NASD_ATTR_BLOCK_SIZE)
    return(NASD_OP_NOT_SUPPORTED);

  /* all per-operation state lives in the opholder, not on the stack */
  NASD_DRIVE_GET_OPHOLDER(cr_oh);
  if (cr_oh == NULL)
    return(NASD_NO_MEM);

  if (in_fieldmask&NASD_ATTR_BLOCK_PREALLOCATION)
    cr_oh->opholder_crobj.prealloc_blocks = in_attr->block_preallocation;
  else
    cr_oh->opholder_crobj.prealloc_blocks = 0;

  if (in_fieldmask&NASD_ATTR_LAYOUT_HINT)
    cr_oh->opholder_crobj.layout_hint = &in_attr->layout_hint;
  else
    cr_oh->opholder_crobj.layout_hint = NULL;

  cr_oh->opholder_crobj.pre_exle = NULL;

  nasd_gettime(&cr_oh->opholder_crobj.cur_time);

  cr_oh->opholder_crobj.part = &PART(partnum);
  cr_oh->opholder_crobj.icp = &nasd_odc_state->parts[partnum];

  if (part_lock_held == 0) {
    NASD_ODC_ICPART_LOCK_WRITE(cr_oh->opholder_crobj.icp);
  }
  cr_oh->opholder_crobj.icp->last_objlist_npt = 0;
  cr_oh->opholder_crobj.icp->last_objlist_off = 0;

  if (NASD_OD_INVALID_PART(cr_oh->opholder_crobj.part)) {
    if (part_lock_held == 0) {
      NASD_ODC_ICPART_UNLOCK_WRITE(cr_oh->opholder_crobj.icp);
    }
    NASD_DRIVE_FREE_OPHOLDER(cr_oh);
    return(NASD_BAD_PARTITION);
  }
  cr_oh->opholder_crobj.newid = NASDID_NULL;

  /*
   * Get node and possibly preallocation
   */
  rc = nasd_od_layout_get_node_block(partnum,
    cr_oh->opholder_crobj.prealloc_blocks,
    cr_oh->opholder_crobj.layout_hint,
    &cr_oh->opholder_crobj.n_exle,
    &cr_oh->opholder_crobj.pre_exle);
  if (rc) {
    if (part_lock_held == 0) {
      NASD_ODC_ICPART_UNLOCK_WRITE(cr_oh->opholder_crobj.icp);
    }
    NASD_DRIVE_FREE_OPHOLDER(cr_oh);
    return(rc);
  }

  /*
   * Find open slot in node pagetable for this object.
   * We take advantage of a weaselly trick to check quickly
   * here- every created node is a ref on the node pagetable
   * object; therefore, by counting the refs on a node
   * pagetable block, we know how many free slots there are,
   * without loading the block or searching it.
   *
   * We don't always start at the beginning of the node pagetable;
   * to increase "create bandwidth" (number of back-to-back creates
   * we can do in a second), we stagger objects through the NPT
   * blocks, to avoid having the next create wait for the previous
   * create's writeback of the NPT blocks to complete.
   */
  cr_oh->opholder_crobj.fn = NASD_ODC_REFBLK_OF(nasd_odc_state->disk->npt_ext.first);
  cr_oh->opholder_crobj.ln = NASD_ODC_REFBLK_OF(nasd_odc_state->disk->npt_ext.last);
  cr_oh->opholder_crobj.nb = 0;
  cr_oh->opholder_crobj.re = NULL;
  rc = NASD_SUCCESS;

  cr_oh->opholder_crobj.f = nasd_odc_state->cr_ind;
#if NASD_DRIVE_ROTATE_CREATE > 0
  nasd_odc_state->cr_ind++;
  if (nasd_odc_state->cr_ind >= nasd_odc_state->npt_sz) {
    nasd_odc_state->cr_ind = 0;
  }
#endif /* NASD_DRIVE_ROTATE_CREATE > 0 */
  /*
   * NOTE(review): iblk is never advanced and rc is left at
   * NASD_NO_MORE_OBJECTS when the inner scan finds nothing, so this
   * loop body runs at most once: only the NPT blocks from the
   * staggered start to the end of that refcount block (or of the NPT)
   * are examined. Confirm whether a wider search was intended before
   * changing it.
   */
  for(cr_oh->opholder_crobj.iblk=0;
    ((cr_oh->opholder_crobj.iblk<nasd_odc_state->npt_sz)
      &&(cr_oh->opholder_crobj.nb==0)&&(rc==NASD_SUCCESS))
    ;)
  {
    /*
     * Stagger cur_blk through NPT
     */
    cr_oh->opholder_crobj.cur_blk =
      ((cr_oh->opholder_crobj.iblk+cr_oh->opholder_crobj.f)
        %nasd_odc_state->npt_sz)
      + nasd_odc_state->disk->npt_ext.first;
    cr_oh->opholder_crobj.cur_ref_blk =
      NASD_ODC_REFBLK_OF(cr_oh->opholder_crobj.cur_blk);
    cr_oh->opholder_crobj.fb =
      NASD_ODC_REFBLK_FIRST(cr_oh->opholder_crobj.cur_ref_blk);
    cr_oh->opholder_crobj.lb =
      NASD_ODC_REFBLK_LAST(cr_oh->opholder_crobj.cur_ref_blk);
    /* drop a held refcount block if it is not the one we need now */
    if (cr_oh->opholder_crobj.re &&
      (cr_oh->opholder_crobj.re->blkno != cr_oh->opholder_crobj.cur_ref_blk))
    {
      NASD_ODC_UNLOCK_BLOCK(cr_oh->opholder_crobj.re);
      nasd_odc_block_release(cr_oh->opholder_crobj.re);
      cr_oh->opholder_crobj.re = NULL;
    }
    if (cr_oh->opholder_crobj.re == NULL) {
      rc = nasd_odc_block_get(NULL, cr_oh->opholder_crobj.cur_ref_blk,
        NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD,
        &cr_oh->opholder_crobj.re, NASD_ID_NULL, 0, NASD_ODC_T_REFCNT, NULL);
      /*
       * Only lock what we actually got: on failure re does not
       * refer to a usable entry and must not be touched.
       */
      if (rc == NASD_SUCCESS) {
        NASD_ODC_LOCK_BLOCK(cr_oh->opholder_crobj.re);
      }
    }
    else {
      rc = NASD_SUCCESS;
    }
    if (rc == NASD_SUCCESS) {
      nasd_odc_wait_not_busy_invalid(cr_oh->opholder_crobj.re);
      NASD_ASSERT(!(cr_oh->opholder_crobj.re->data_flags&NASD_CD_INVALID));
      rc = NASD_NO_MORE_OBJECTS;
      for(i=NASD_ODC_OFF_IN_REFBLK(cr_oh->opholder_crobj.cur_blk);
        ((cr_oh->opholder_crobj.cur_blk<=nasd_odc_state->disk->npt_ext.last)
        &&(i<NASD_OD_REFS_PER_BLOCK));cr_oh->opholder_crobj.cur_blk++,i++)
      {
        /*
         * Check below is <=, not <, because there's one
         * ref for just existing.
         */
        if (cr_oh->opholder_crobj.re->data.cnt[i]
          <= NASD_OD_NODES_PER_NPT_BLOCK)
        {
#if NASD_DRIVE_ROTATE_CREATE == 0
          nasd_odc_state->cr_ind =
            (i+cr_oh->opholder_crobj.f)%nasd_odc_state->npt_sz;
#endif /* NASD_DRIVE_ROTATE_CREATE == 0 */
          cr_oh->opholder_crobj.nb = cr_oh->opholder_crobj.cur_blk;
          rc = NASD_SUCCESS;
          break;
        }
      }
    }
    else {
      /* block get failed; make sure the bail-out below sees no entry */
      cr_oh->opholder_crobj.re = NULL;
    }
  }
  /*
   * Now nb is the block number of a node pagetable block with
   * an open slot.
   */
  if ((cr_oh->opholder_crobj.re == NULL)
    || (rc != NASD_SUCCESS) || (cr_oh->opholder_crobj.nb == 0))
  {
    /*
     * Something went wrong- didn't find an open slot, had an error
     * getting a ref block, or some other internal failure. Bail hard.
     * XXX Eventually, if we're out of slots, we should move on to the
     * lvl-2 pagetable.
     */
    if (cr_oh->opholder_crobj.re) {
      NASD_ODC_UNLOCK_BLOCK(cr_oh->opholder_crobj.re);
      nasd_odc_block_release(cr_oh->opholder_crobj.re);
    }
    /*
     * We don't yet have on-disk references on the node block
     * or preallocation range, so just hand them back to the
     * free-block tracker.
     */
    rc2 = nasd_od_layout_node_fail_create(partnum,
      cr_oh->opholder_crobj.prealloc_blocks,
      cr_oh->opholder_crobj.n_exle, cr_oh->opholder_crobj.pre_exle);
    if (rc2 != NASD_SUCCESS)
      NASD_PANIC();
    if (part_lock_held == 0) {
      NASD_ODC_ICPART_UNLOCK_WRITE(cr_oh->opholder_crobj.icp);
    }
    NASD_DRIVE_FREE_OPHOLDER(cr_oh);
    return(NASD_NO_MORE_OBJECTS);
  }
  /*
   * Found an open slot. Increment refcnt on the node pt block,
   * release the block.
   *
   * Note that we only track ref counts for nodes held on the primary
   * node pagetable blocks, not the secondary. This is a performance
   * optimization, since that information is fully recoverable.
   */
  NASD_ASSERT(cr_oh->opholder_crobj.nb);
  i = NASD_ODC_OFF_IN_REFBLK(cr_oh->opholder_crobj.nb);
  nasd_odc_dirty_ent(cr_oh->opholder_crobj.re);
  cr_oh->opholder_crobj.re->data.cnt[i]++;
  NASD_ODC_UNLOCK_BLOCK(cr_oh->opholder_crobj.re);
  nasd_odc_block_release(cr_oh->opholder_crobj.re);

  /*
   * Get the actual node pagetable block, but don't update or write
   * it just yet.
   */
  cr_oh->opholder_crobj.nblk = 0;

  rc = nasd_odc_block_get(NULL, cr_oh->opholder_crobj.nb,
    NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD,
    &cr_oh->opholder_crobj.npte, NASD_ID_NULL, 0,
    NASD_ODC_T_NPT1, NULL);
  NASD_ASSERT(rc == NASD_SUCCESS);
  NASD_ODC_WLOCK_BLOCK_DATA(cr_oh->opholder_crobj.npte);
  NASD_ODC_LOCK_BLOCK(cr_oh->opholder_crobj.npte);
  nasd_odc_wait_not_busy_invalid(cr_oh->opholder_crobj.npte);
  NASD_ASSERT(!(cr_oh->opholder_crobj.npte->data_flags&NASD_CD_INVALID));
  cr_oh->opholder_crobj.npti = (-1);
  for(i=0;i<NASD_OD_NODES_PER_NPT_BLOCK;i++) {
    if (cr_oh->opholder_crobj.npte->data.pte[i].blkno == 0) {
      /* found an empty slot */
      cr_oh->opholder_crobj.nblk =
        i + ((cr_oh->opholder_crobj.nb-nasd_odc_state->disk->npt_ext.first)
        * NASD_OD_NODES_PER_NPT_BLOCK);
      cr_oh->opholder_crobj.npti = i;
      /* reserve the slot with a placeholder until the node exists */
      cr_oh->opholder_crobj.npte->data.pte[cr_oh->opholder_crobj.npti].blkno =
        NASD_CR_BLKNO;
      cr_oh->opholder_crobj.nodenum = cr_oh->opholder_crobj.nblk
        + nasd_reserved_nodes;
      break;
    }
  }
  NASD_ASSERT(cr_oh->opholder_crobj.npti>=0);
  nasd_odc_dirty_ent(cr_oh->opholder_crobj.npte);
  NASD_ODC_UNLOCK_BLOCK(cr_oh->opholder_crobj.npte);
  NASD_ODC_WUNLOCK_BLOCK_DATA(cr_oh->opholder_crobj.npte);

  /*
   * Get duplicate npt block, check for match
   */
  cr_oh->opholder_crobj.nb2 = cr_oh->opholder_crobj.nb
    + nasd_odc_state->npt_sz;
  rc = nasd_odc_block_get(NULL, cr_oh->opholder_crobj.nb2,
    NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD,
    &cr_oh->opholder_crobj.npte2, NASD_ID_NULL, 0,
    NASD_ODC_T_NPT2, NULL);
  NASD_ASSERT(rc == NASD_SUCCESS);
  NASD_ODC_WLOCK_BLOCK_DATA(cr_oh->opholder_crobj.npte2);
  NASD_ODC_LOCK_BLOCK(cr_oh->opholder_crobj.npte2);
  nasd_odc_wait_not_busy_invalid(cr_oh->opholder_crobj.npte2);
  NASD_ASSERT(!(cr_oh->opholder_crobj.npte2->data_flags&NASD_CD_INVALID));
  if (cr_oh->opholder_crobj.npte2->data.pte[cr_oh->opholder_crobj.npti].blkno
    != NASD_NULL_BLKNO)
  {
    /* primary says the slot is free but the duplicate disagrees */
    nasd_printf("nb=%u nb2=%u\n", cr_oh->opholder_crobj.nb,
      cr_oh->opholder_crobj.nb2);
    nasd_printf("npti=%d\n", cr_oh->opholder_crobj.npti);
    nasd_printf("npt=%u..%u and %u..%u\n", nasd_odc_state->disk->npt_ext.first,
      nasd_odc_state->disk->npt_ext.last, nasd_odc_state->disk->npt2_ext.first,
      nasd_odc_state->disk->npt2_ext.last);
    nasd_printf("npte->data.pte[npti].blkno=%u\n",
      cr_oh->opholder_crobj.npte->data.pte[cr_oh->opholder_crobj.npti].blkno);
    nasd_printf("npte2->data.pte[npti].blkno=%u\n",
      cr_oh->opholder_crobj.npte2->data.pte[cr_oh->opholder_crobj.npti].blkno);
    NASD_ODC_UNLOCK_BLOCK(cr_oh->opholder_crobj.npte2);
    NASD_ODC_WUNLOCK_BLOCK_DATA(cr_oh->opholder_crobj.npte2);
    nasd_odc_block_release(cr_oh->opholder_crobj.npte2);
    nasd_odc_block_release(cr_oh->opholder_crobj.npte);
    nasd_od_check_npt();
    NASD_PANIC();
    if (part_lock_held == 0) {
      NASD_ODC_ICPART_UNLOCK_WRITE(cr_oh->opholder_crobj.icp);
    }
  }
  NASD_ASSERT(
    cr_oh->opholder_crobj.npte2->data.pte[cr_oh->opholder_crobj.npti].blkno
    == NASD_NULL_BLKNO);
  cr_oh->opholder_crobj.npte2->data.pte[cr_oh->opholder_crobj.npti].blkno =
    NASD_CR_BLKNO;

  nasd_odc_dirty_ent(cr_oh->opholder_crobj.npte2);
  NASD_ODC_UNLOCK_BLOCK(cr_oh->opholder_crobj.npte2);
  NASD_ODC_WUNLOCK_BLOCK_DATA(cr_oh->opholder_crobj.npte2);

  /*
   * First, write out the new node, so if we die after writing
   * the node pagetable block, the drive-verifier is more sane.
   */

  /*
   * compute new identifier: node number in the low bits, partition
   * number in bits 53..56, partition generation in bits 57..63
   */
  nid = cr_oh->opholder_crobj.nodenum;
  n64 = (nasd_uint64)cr_oh->opholder_crobj.part->generation;
  n64 <<= 57;
  n64 &= nasd_int64cast(0xfe00000000000000);
  nid |= n64;
  n64 = (nasd_uint64)partnum;
  n64 <<= 53;
  n64 &= nasd_int64cast(0x01e0000000000000);
  nid |= n64;

  /* get block for this node (no LOAD: contents are initialized below) */
  rc = nasd_odc_block_get(NULL, cr_oh->opholder_crobj.n_exle->range.first,
    NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK,
    &cr_oh->opholder_crobj.ne, nid, 0, NASD_ODC_T_NODE, NULL);
  if (rc != NASD_SUCCESS) {
    nasd_printf("%s:%d rc=0x%x (%s)\n", __FILE__, __LINE__, rc,
      nasd_error_string(rc));
    NASD_PANIC();
  }
  NASD_ODC_LOCK_BLOCK(cr_oh->opholder_crobj.ne);
  /*
   * Block might be in the process of being recycled from
   * the anonymous state with a pending I/O. We have to wait
   * for that I/O to complete.
   */
  if (cr_oh->opholder_crobj.ne->data_flags&NASD_CD_BUSY) {
    NASD_ODC_CSINC(create_busy);
    nasd_odc_wait_not_busy(cr_oh->opholder_crobj.ne);
  }

  /*
   * Init node attributes
   */
  nasd_odc_dirty_ent(cr_oh->opholder_crobj.ne);
  cr_oh->opholder_crobj.np = cr_oh->opholder_crobj.ne->data.node;
  cr_oh->opholder_crobj.np->blocks_allocated = 1;
  cr_oh->opholder_crobj.np->blocks_preallocated =
    cr_oh->opholder_crobj.prealloc_blocks;
  if (in_fieldmask&NASD_ATTR_OBJECT_LEN)
    cr_oh->opholder_crobj.np->object_len = in_attr->object_len;
  else
    cr_oh->opholder_crobj.np->object_len = 0;
  cr_oh->opholder_crobj.np->attr_modify_time =
    cr_oh->opholder_crobj.cur_time;
  cr_oh->opholder_crobj.np->object_modify_time =
    cr_oh->opholder_crobj.cur_time;
  cr_oh->opholder_crobj.np->object_create_time =
    cr_oh->opholder_crobj.cur_time;
  if (in_fieldmask&NASD_ATTR_ATTR_MODIFY_TIME) {
    cr_oh->opholder_crobj.np->fs_attr_modify_time = in_attr->attr_modify_time;
  }
  else {
    cr_oh->opholder_crobj.np->fs_attr_modify_time =
      cr_oh->opholder_crobj.cur_time;
  }
  if (in_fieldmask&NASD_ATTR_MODIFY_TIME) {
    /*
     * This must land in fs_object_modify_time. The node block was
     * obtained without a load, so every field has to be written on
     * both branches; storing into fs_attr_modify_time here would
     * clobber the value chosen just above and leave
     * fs_object_modify_time holding stale bytes.
     *
     * NOTE(review): the source fields for both fs_* times are the
     * non-fs members of in_attr; confirm that is the intended
     * mapping.
     */
    cr_oh->opholder_crobj.np->fs_object_modify_time =
      in_attr->object_modify_time;
  }
  else {
    cr_oh->opholder_crobj.np->fs_object_modify_time =
      cr_oh->opholder_crobj.cur_time;
  }
  if (in_fieldmask&NASD_ATTR_AV)
    cr_oh->opholder_crobj.np->akvers = in_attr->av;
  else
    cr_oh->opholder_crobj.np->akvers = 0;
  if (in_fieldmask&NASD_ATTR_FS_SPECIFIC) {
    bcopy((char *)in_attr->fs_specific,
      (char *)cr_oh->opholder_crobj.np->fs_specific,
      NASD_FS_SPECIFIC_INFO_SIZE);
  }
  else {
    bzero((char *)cr_oh->opholder_crobj.np->fs_specific,
      NASD_FS_SPECIFIC_INFO_SIZE);
  }
  cr_oh->opholder_crobj.np->refcnt = 0;
  NASD_ASSERT((partnum&NASD_ND_PARTMASK) == partnum);
  /* low bits of flags are partnum */
  cr_oh->opholder_crobj.np->flags = partnum;
  /* a fresh object is alone on its copy-on-write ring */
  cr_oh->opholder_crobj.np->cow_src = NASD_NULL_BLKNO;
  cr_oh->opholder_crobj.np->cow_next = cr_oh->opholder_crobj.nodenum;
  cr_oh->opholder_crobj.np->cow_prev = cr_oh->opholder_crobj.nodenum;
  if (cr_oh->opholder_crobj.pre_exle) {
    cr_oh->opholder_crobj.np->prealloc_ex =
      cr_oh->opholder_crobj.pre_exle->range;
  }
  else {
    cr_oh->opholder_crobj.np->prealloc_ex.first = NASD_NULL_BLKNO;
    cr_oh->opholder_crobj.np->prealloc_ex.last = NASD_NULL_BLKNO;
  }
  cr_oh->opholder_crobj.np->generation =
    cr_oh->opholder_crobj.part->generation;
  cr_oh->opholder_crobj.gen = cr_oh->opholder_crobj.np->generation;
  /* advance the partition generation, skipping 0 on wrap */
  cr_oh->opholder_crobj.part->generation++;
  if ((cr_oh->opholder_crobj.part->generation&NASD_OD_GEN_MASK) == 0)
    cr_oh->opholder_crobj.part->generation = 1;
  cr_oh->opholder_crobj.np->nodenum = cr_oh->opholder_crobj.nodenum;
  cr_oh->opholder_crobj.np->last_block =
    cr_oh->opholder_crobj.n_exle->range.last;
  bzero((char *)cr_oh->opholder_crobj.np->ptrs,
    sizeof(cr_oh->opholder_crobj.np->ptrs));
#if NASD_OD_PTR_SPARE_BYTES > 0
  bzero((char *)cr_oh->opholder_crobj.np->spare_ptrs,
    sizeof(cr_oh->opholder_crobj.np->spare_ptrs));
#endif /* NASD_OD_PTR_SPARE_BYTES > 0 */
  bzero((char *)cr_oh->opholder_crobj.np->spare,
    sizeof(cr_oh->opholder_crobj.np->spare));
  cr_oh->opholder_crobj.ne->data_flags &= ~(NASD_CD_INVALID|NASD_CD_NZ);
#if NASD_FLUSH_BLOCKS > 0
  cr_oh->opholder_crobj.ne->data_flags |= NASD_CD_BUSY;
#endif /* NASD_FLUSH_BLOCKS > 0 */
  NASD_ODC_UNLOCK_BLOCK(cr_oh->opholder_crobj.ne);
#if NASD_FLUSH_BLOCKS > 0
#if NASD_FLUSH_BLOCKS_SYNC > 0
  /* safe but slow version */
  nasd_od_io_flush_block(cr_oh->opholder_crobj.ne);
#else /* NASD_FLUSH_BLOCKS_SYNC > 0 */
  /* fast but potentially unsafe in event of poorly timed crash */
  nasd_od_io_flush_block_async(cr_oh->opholder_crobj.ne);
#endif /* NASD_FLUSH_BLOCKS_SYNC > 0 */
#endif /* NASD_FLUSH_BLOCKS > 0 */

  /* upper 32 bits of the identifier are mirrored into the NPT entry */
  cr_oh->opholder_crobj.newid = cr_oh->opholder_crobj.ne->identifier;
  n64 = cr_oh->opholder_crobj.newid & nasd_int64cast(0xffffffff00000000);
  n64 >>= 32;
  cr_oh->opholder_crobj.hi32 = n64;

  /*
   * Update the node pagetable block
   *
   * NOTE(review): here (and for the duplicate below) the block lock
   * is taken before the data write lock, the reverse of the order
   * used when the slot was reserved above. Left as found; confirm
   * the intended lock order.
   */
  NASD_ODC_LOCK_BLOCK(cr_oh->opholder_crobj.npte);
  NASD_ODC_WLOCK_BLOCK_DATA(cr_oh->opholder_crobj.npte);
  nasd_odc_dirty_ent(cr_oh->opholder_crobj.npte);
  cr_oh->opholder_crobj.npte->data.pte[cr_oh->opholder_crobj.npti].blkno =
    cr_oh->opholder_crobj.n_exle->range.first;
  cr_oh->opholder_crobj.npte->data.pte[cr_oh->opholder_crobj.npti].highbits =
    cr_oh->opholder_crobj.hi32;
  NASD_ODC_WUNLOCK_BLOCK_DATA(cr_oh->opholder_crobj.npte);
#if NASD_FLUSH_BLOCKS > 0
  cr_oh->opholder_crobj.npte->data_flags |= NASD_CD_BUSY;
#endif /* NASD_FLUSH_BLOCKS > 0 */
  NASD_ODC_UNLOCK_BLOCK(cr_oh->opholder_crobj.npte);
#if NASD_FLUSH_BLOCKS > 0
  nasd_od_io_flush_block_async(cr_oh->opholder_crobj.npte);
#endif /* NASD_FLUSH_BLOCKS > 0 */
  nasd_odc_block_release(cr_oh->opholder_crobj.npte);

  /* and duplicate */
  NASD_ODC_LOCK_BLOCK(cr_oh->opholder_crobj.npte2);
  NASD_ODC_WLOCK_BLOCK_DATA(cr_oh->opholder_crobj.npte2);
  nasd_odc_dirty_ent(cr_oh->opholder_crobj.npte2);
  cr_oh->opholder_crobj.npte2->data.pte[cr_oh->opholder_crobj.npti].blkno =
    cr_oh->opholder_crobj.n_exle->range.first;
  cr_oh->opholder_crobj.npte2->data.pte[cr_oh->opholder_crobj.npti].highbits =
    cr_oh->opholder_crobj.hi32;
  NASD_ODC_WUNLOCK_BLOCK_DATA(cr_oh->opholder_crobj.npte2);
#if NASD_FLUSH_BLOCKS > 0
  cr_oh->opholder_crobj.npte2->data_flags |= NASD_CD_BUSY;
#endif /* NASD_FLUSH_BLOCKS > 0 */
  NASD_ODC_UNLOCK_BLOCK(cr_oh->opholder_crobj.npte2);
#if NASD_FLUSH_BLOCKS > 0
  /*
   * Garth says it's okay to skip doing this here in the
   * interest of performance. The theory is, we really have
   * that NVRAM log where we've marked this change until
   * we commit the block for real.
   */
  nasd_od_io_flush_block_async(cr_oh->opholder_crobj.npte2);
#endif /* NASD_FLUSH_BLOCKS > 0 */
  nasd_odc_block_release(cr_oh->opholder_crobj.npte2);

  /* identify the node block as used */
  cr_oh->opholder_crobj.part->last_cr_del = cr_oh->opholder_crobj.cur_time;
  cr_oh->opholder_crobj.part->num_obj++;

  nasd_part_modified(partnum);

  if (part_lock_held == 0) {
    NASD_ODC_ICPART_UNLOCK_WRITE(cr_oh->opholder_crobj.icp);
  }

  /* partnum argument ignored in this call */
  rc = nasd_odc_ref_ranges(partnum, cr_oh->opholder_crobj.n_exle, 1, NULL,
    NASD_ODC_REF_NOFLAGS);
  if (rc != NASD_SUCCESS)
    NASD_PANIC();
  nasd_odc_release_extent_list(cr_oh->opholder_crobj.n_exle);

  if (cr_oh->opholder_crobj.pre_exle) {
    /* partnum argument ignored in this call */
    rc = nasd_odc_ref_ranges(partnum, cr_oh->opholder_crobj.pre_exle, 1,
      NULL, NASD_ODC_REF_NOFLAGS);
    if (rc != NASD_SUCCESS)
      NASD_PANIC();
    nasd_odc_release_extent_list(cr_oh->opholder_crobj.pre_exle);
  }

  /* report the result, wake any waiters on the node, drop our ref */
  nasd_odc_get_attr_from_ent(cr_oh->opholder_crobj.ne, out_attr, 0);
  NASD_BROADCAST_COND(cr_oh->opholder_crobj.ne->cond);
  nasd_odc_block_release(cr_oh->opholder_crobj.ne);
  *out_id = cr_oh->opholder_crobj.newid;

  NASD_DRIVE_FREE_OPHOLDER(cr_oh);
  return(NASD_SUCCESS);
} /* end nasd_obj_create() */

/*
 * Translate a node number into the disk block number of its node
 * block by consulting the node pagetable (NPT).
 *
 * XXX eventually, this should be made to do lvl-2 npt lookups
 *
 * Caller expected to hold partition read or write lock
 *
 * partnum     partition; must be a valid, in-use partition
 * in_nodenum  node number as it appears in identifiers (including
 *             the nasd_reserved_nodes offset)
 * lvl2_hint   currently unused
 * blkp        receives the block number recorded in the NPT slot;
 *             set to 0 on every failure path
 *
 * Returns NASD_SUCCESS, NASD_OP_NOT_SUPPORTED for reserved node
 * numbers, NASD_BAD_IDENTIFIER if the node number is beyond the
 * first-level NPT, NASD_BAD_PARTITION, or the error from fetching
 * the NPT block.
 */
nasd_status_t
nasd_odc_nodenum_to_blknum(
  int              partnum,
  nasd_nodenum_t   in_nodenum,
  nasd_blkno_t     lvl2_hint,
  nasd_blkno_t    *blkp)
{
  nasd_blkno_t pt_index, primary_bn, mirror_bn;
  nasd_nodenum_t rel_node;
  nasd_blkcnt_t npt_blocks;
  nasd_odc_ent_t *pt_ent;
  nasd_od_part_t *part;
  nasd_status_t status;
  int slot;

  *blkp = 0;

  /* XXX special nodes */
  if (in_nodenum < nasd_reserved_nodes)
    return(NASD_OP_NOT_SUPPORTED);

  rel_node = in_nodenum - nasd_reserved_nodes;

  /* don't support n-level yet */
  npt_blocks = nasd_odc_state->disk->npt_ext.last
    - nasd_odc_state->disk->npt_ext.first + 1;
  if (rel_node >= (npt_blocks * NASD_OD_NODES_PER_NPT_BLOCK))
    return(NASD_BAD_IDENTIFIER);

  part = &PART(partnum);
  if (NASD_OD_INVALID_PART(part))
    return(NASD_BAD_PARTITION);

  /* which NPT block (primary and mirror) and which slot within it */
  pt_index = (rel_node / NASD_OD_NODES_PER_NPT_BLOCK);
  slot = rel_node % NASD_OD_NODES_PER_NPT_BLOCK;
  mirror_bn = nasd_odc_state->disk->npt2_ext.first + pt_index;
  primary_bn = pt_index + nasd_odc_state->disk->npt_ext.first;

  /*
   * Prefer the mirror copy if it happens to be in-core already
   * (flags of 0: no forcing, no load). Otherwise fall back to the
   * primary copy, reading it in if necessary.
   */
  status = nasd_odc_block_get(NULL, mirror_bn, 0, &pt_ent, NASD_ID_NULL, 0,
    NASD_ODC_T_NPT2, NULL);
  if (status != NASD_SUCCESS) {
    status = nasd_odc_block_get(NULL, primary_bn,
      NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD,
      &pt_ent, NASD_ID_NULL, 0, NASD_ODC_T_NPT1, NULL);
  }
  if (status != NASD_SUCCESS)
    return(status);

  /* wait for the block contents to be usable */
  NASD_ODC_LOCK_BLOCK(pt_ent);
  nasd_odc_wait_not_busy_invalid(pt_ent);
  NASD_ODC_UNLOCK_BLOCK(pt_ent);

  /* pull the mapping out under the data read lock */
  NASD_ODC_RLOCK_BLOCK_DATA(pt_ent);
  *blkp = pt_ent->data.pte[slot].blkno;
  NASD_ODC_RUNLOCK_BLOCK_DATA(pt_ent);

  nasd_odc_block_release(pt_ent);

  return(status);
}

/*
 * Look up a node by node number and return a referenced cache entry
 * for its node block.
 *
 * XXX eventually, this should be made to do lvl-2 npt lookups
 *
 * nid         full identifier of the object, passed to the cache
 *             when fetching the node block
 * partnum     partition the object lives in
 * in_nodenum  node number as it appears in identifiers (including
 *             the nasd_reserved_nodes offset)
 * lvl2_hint   currently unused
 * entp        receives the node's cache entry on success, NULL when
 *             the lookup fails after the partition lock was taken.
 *             It is not written on the early argument-validation
 *             returns.
 *
 * The partition read lock is taken and released internally.
 *
 * Returns NASD_SUCCESS, NASD_BAD_PARTITION, NASD_OP_NOT_SUPPORTED for
 * reserved node numbers, NASD_BAD_IDENTIFIER (out of range, empty
 * slot, or object being removed), or the error from fetching the
 * node pagetable block or the node block.
 */
nasd_status_t
nasd_odc_node_get(
  nasd_identifier_t    nid,
  int                  partnum,
  nasd_nodenum_t       in_nodenum,
  nasd_blkno_t         lvl2_hint,
  nasd_odc_ent_t     **entp)
{
  nasd_odc_ent_t *npt_ent, *ent, ich, *il;
  nasd_blkno_t npt_bn, npt_bn2, nd_bn;
  nasd_odc_icpart_t *icp;
  nasd_nodenum_t nodenum;
  nasd_od_part_t *part;
  nasd_timespec_t ts;
  nasd_blkcnt_t nn;
  nasd_status_t rc;
  nasd_timer_t tm;
  int npti, cr;

  NASD_TM_START(&tm);

  if (!NASD_OD_PARTNUM_VALID(partnum))
    return(NASD_BAD_PARTITION);

  part = &PART(partnum);
  icp = &nasd_odc_state->parts[partnum];
  ent = NULL;

  if (in_nodenum < nasd_reserved_nodes) {
    /* XXX special nodes */
    return(NASD_OP_NOT_SUPPORTED);
  }

  nodenum = in_nodenum - nasd_reserved_nodes;

  nn = nasd_odc_state->disk->npt_ext.last
    - nasd_odc_state->disk->npt_ext.first + 1;
  if (nodenum >= (nn * NASD_OD_NODES_PER_NPT_BLOCK)) {
    /* don't support n-level yet */
    return(NASD_BAD_IDENTIFIER);
  }

  NASD_ODC_ICPART_LOCK_READ(icp);

  if (NASD_OD_INVALID_PART(part)) {
    NASD_ODC_ICPART_UNLOCK_READ(icp);
    return(NASD_BAD_PARTITION);
  }

  /* compute npt locations (primary and copy) */
  npt_bn = (nodenum / NASD_OD_NODES_PER_NPT_BLOCK);
  npt_bn2 = nasd_odc_state->disk->npt2_ext.first + npt_bn;
  npt_bn += nasd_odc_state->disk->npt_ext.first;
  npti = nodenum % NASD_OD_NODES_PER_NPT_BLOCK;

  NASD_TM_STOP(&tm);
  NASD_TM_ELAPSED_TS(&tm, &ts);
  NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.node_compute_time, &ts);

  /*
   * npt_bn is blocknumber of nodepagetable ent for this node
   * npt_bn2 is its duplicate
   * First, check for duplicate already in-core. If not, get
   * original, whether in or out of core.
   */
  rc = nasd_odc_block_get(NULL, npt_bn2, 0, &npt_ent, NASD_ID_NULL, 0,
    NASD_ODC_T_NPT2, NULL);
  if (rc != NASD_SUCCESS) {
    /*
     * Secondary copy not in-core. Get primary copy. If it's
     * not already here, read it in.
     */
    rc = nasd_odc_block_get(NULL, npt_bn,
      NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD,
      &npt_ent, NASD_ID_NULL, 0, NASD_ODC_T_NPT1, NULL);
  }

  NASD_TM_STOP(&tm);
  NASD_TM_ELAPSED_TS(&tm, &ts);
  NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.node_hash_get_time, &ts);
  NASD_TM_START(&tm);

  if (rc != NASD_SUCCESS) {
    /*
     * Neither copy of the pagetable block could be obtained, so
     * npt_ent does not refer to a valid entry: it must not be
     * locked, read, or released. Drop the partition lock and
     * report the failure.
     */
    NASD_ODC_ICPART_UNLOCK_READ(icp);
    *entp = NULL;
    return(rc);
  }

  /* wait for the pagetable block contents to be usable */
  NASD_ODC_LOCK_BLOCK(npt_ent);
  nasd_odc_wait_not_busy_invalid(npt_ent);
  NASD_ODC_UNLOCK_BLOCK(npt_ent);

  /* read the node's block number out of its slot */
  NASD_ODC_RLOCK_BLOCK_DATA(npt_ent);
  nd_bn = npt_ent->data.pte[npti].blkno;
  NASD_ODC_RUNLOCK_BLOCK_DATA(npt_ent);
  nasd_odc_block_release(npt_ent);

  /*
   * Separate phase 1 and 2 so we can get a ref on the block
   * (via lookup) before trying to take the block lock- this
   * allows us to release the partition lock in the mean time,
   * and avoid deadlock with someone taking a partition lock
   * in the bmap fault path while holding the block lock.
   */
  if (nd_bn) {
    rc = nasd_odc_block_get_part1(NULL, nd_bn,
      NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD,
      &ent, nid, 0, NASD_ODC_T_NODE, NULL, &cr);
    if (rc == NASD_OBJ_REMOVING) {
      /* delete begun, id no longer valid */
      rc = NASD_BAD_IDENTIFIER;
      ent = NULL;
    }
    NASD_ODC_ICPART_UNLOCK_READ(icp);
    if (rc == NASD_SUCCESS) {
      /* empty circular I/O chain; part2 may append cluster reads */
      ich.inext = ich.iprev = &ich;
#if NASD_DRIVE_INODE_FETCH_CLUSTER_POLICY == 0
      rc = nasd_odc_block_get_part2(NULL, nd_bn,
        NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD,
        &ent, nid, 0, NASD_ODC_T_NODE, NULL, &cr);
#endif /* NASD_DRIVE_INODE_FETCH_CLUSTER_POLICY == 0 */
#if NASD_DRIVE_INODE_FETCH_CLUSTER_POLICY == 1
      rc = nasd_odc_block_get_part2(NULL, nd_bn,
        NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_MLOAD,
        &ent, nid, 0, NASD_ODC_T_NODE, &ich, &cr);
#endif /* NASD_DRIVE_INODE_FETCH_CLUSTER_POLICY == 1 */
#if NASD_DRIVE_INODE_FETCH_CLUSTER_POLICY == 2
      if ((nd_bn-1) % nasd_od_region_blocks == 0) {
        rc = nasd_odc_block_get_part2(NULL, nd_bn,
          NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD,
          &ent, nid, 0, NASD_ODC_T_NODE, NULL, &cr);
      }
      else {
        rc = nasd_odc_block_get_part2(NULL, nd_bn,
          NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_MLOAD,
          &ent, nid, 0, NASD_ODC_T_NODE, &ich, &cr);
      }
#endif /* NASD_DRIVE_INODE_FETCH_CLUSTER_POLICY == 2 */
#if NASD_DRIVE_INODE_FETCH_CLUSTER_POLICY > 2
      NASD_PANIC();
#endif /* NASD_DRIVE_INODE_FETCH_CLUSTER_POLICY > 2 */
      if (rc) {
        /* part1 left us holding a ref; give it back */
        nasd_odc_block_release(ent);
        ent = NULL;
      }
      else if (ich.inext != &ich) {
        /*
         * Actually issue I/Os
         */
        il = ich.inext;
        /* disconnect I/O chain */
        ich.iprev->inext = NULL;
        ich.inext->iprev = NULL;
        nasd_od_io_enq(il, NASD_U_READ, NASD_IO_PRI_HI);
      }
    }
    else {
      ent = NULL;
    }
  }
  else {
    /* no such object */
    NASD_ODC_ICPART_UNLOCK_READ(icp);
    rc = NASD_BAD_IDENTIFIER;
    ent = NULL;
  }

  NASD_TM_STOP(&tm);
  NASD_TM_ELAPSED_TS(&tm, &ts);
  NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.node_block_get_time, &ts);

  *entp = ent;

  if (ent) {
    NASD_ODC_CHECK_NODE_ENT(ent);
  }

  return(rc);
}

/*
 * Resolve an object identifier to a referenced node cache entry.
 *
 * The identifier is decomposed into its partition number, node number,
 * level-2 hint and generation; the node block is obtained through
 * nasd_odc_node_get(), and once the block is neither busy nor invalid
 * the identifier is checked against the node: the generation must match
 * (under NASD_OD_GEN_MASK) and the partition encoded in the identifier,
 * the partition recorded in the node and in_partnum must all agree.
 *
 * On success *entp holds the entry (the caller releases it) and
 * NASD_SUCCESS is returned. On any failure *entp is NULL; an identifier
 * that does not match the node yields NASD_BAD_IDENTIFIER, other
 * failures return the status of the failing step.
 */
nasd_status_t
nasd_odc_node_get_from_id(
  int                  in_partnum,
  nasd_identifier_t    nid,
  nasd_odc_ent_t     **entp)
{
  nasd_generation_t id_gen;
  nasd_nodenum_t id_nodenum;
  nasd_blkno_t id_lvl2_hint;
  nasd_odc_ent_t *node_ent;
  nasd_od_node_t *node;
  nasd_timespec_t elapsed;
  nasd_timer_t timer;
  nasd_status_t rc;
  int id_partnum;

  /* phase 1: identifier translation and node block lookup */
  NASD_TM_START(&timer);

  *entp = NULL;

  rc = nasd_od_decompose_id(nid, &id_partnum, &id_nodenum, &id_lvl2_hint,
    &id_gen);
  if (rc) {
    return(rc);
  }

  rc = nasd_odc_node_get(nid, in_partnum, id_nodenum, id_lvl2_hint,
    &node_ent);
  if (rc) {
    return(rc);
  }

  NASD_TM_STOP(&timer);
  NASD_TM_ELAPSED_TS(&timer, &elapsed);
  NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.node_xlate_time, &elapsed);

  /* phase 2: wait for the node contents, then validate the identifier */
  NASD_TM_START(&timer);

  NASD_ODC_LOCK_BLOCK(node_ent);
  nasd_odc_wait_not_busy_invalid(node_ent);
  NASD_ODC_UNLOCK_BLOCK(node_ent);

  node = node_ent->data.node;

  if ((id_gen != (node->generation&NASD_OD_GEN_MASK))
    || (in_partnum != id_partnum)
    || (in_partnum != NASD_OD_PARTNUM(node)))
  {
    /* stale generation or wrong partition: drop our reference */
    nasd_odc_block_release(node_ent);
    rc = NASD_BAD_IDENTIFIER;
  }
  else {
    rc = NASD_SUCCESS;
    *entp = node_ent;
  }

  NASD_TM_STOP(&timer);
  NASD_TM_ELAPSED_TS(&timer, &elapsed);
  NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.node_wait_check_time,
    &elapsed);

  return(rc);
}

/*
 * Fetch the attributes of object nid in partition partnum into *attrp.
 *
 * Identifiers that decompose as control objects are served by
 * nasd_obj_control_getattr(). Otherwise the node is looked up, the
 * attributes are copied out of its cache entry once the block is
 * neither busy nor invalid, and the reference is dropped again.
 *
 * Returns NASD_SUCCESS, or the status of the failing lookup.
 */
nasd_status_t
nasd_obj_getattr(
  int                 partnum,
  nasd_identifier_t   nid,
  nasd_attribute_t   *attrp)
{
  nasd_nodenum_t ctl_nodenum;
  nasd_odc_ent_t *node_ent;
  nasd_status_t rc;

  /* control objects take a separate path */
  if (nasd_od_decompose_control(nid, &ctl_nodenum) == NASD_SUCCESS) {
    return(nasd_obj_control_getattr(partnum, ctl_nodenum, attrp));
  }

  rc = nasd_odc_node_get_from_id(partnum, nid, &node_ent);
  if (!rc) {
    NASD_ODC_LOCK_BLOCK(node_ent);
    nasd_odc_wait_not_busy_invalid(node_ent);
    NASD_ODC_UNLOCK_BLOCK(node_ent);

    nasd_odc_get_attr_from_ent(node_ent, attrp, 0);

    nasd_odc_block_release(node_ent);
    rc = NASD_SUCCESS;
  }

  return(rc);
}

/*
 * Read up to in_len bytes of object nid (partition partnum) starting at
 * byte offset, pushing the data into byte_pipe. *out_len is zeroed on
 * entry and accumulates the number of bytes shipped.
 *
 * Control-object identifiers are handed to
 * nasd_obj_control_read_simple(). For ordinary objects the request is
 * clipped at the object length; a read that starts at or past EOF
 * succeeds with zero bytes. Atomic-mode objects are pushed directly from
 * the node. Everything else is walked in chunks of at most
 * nasd_od_obj_blockmapchunk blocks: each block is first looked up in the
 * cache by logical offset, misses are bmap'd and looked up by physical
 * block number, queued I/O is issued, and the blocks (or the zero-fill
 * block for unmapped ranges) are pushed in order.
 *
 * bms_targ_p  if non-NULL, target rate in bytes/ms used to pace the
 *             per-block pushes; a rate of zero means "do not pace"
 * is_read2    selects the read2_* rather than the read_* statistics
 * is_remote   redirect output through the pipe of ne->invocation;
 *             fails with NASD_REMOTE_UNATTACHED if the node has no
 *             invocation, and with NASD_OP_NOT_SUPPORTED for atomic
 *             objects
 * contextp    if non-NULL, contextp->remain is set to the clipped
 *             length when the request runs past EOF
 *
 * Returns NASD_SUCCESS or the status of the first failing step. The
 * caller terminates the pipe.
 */
nasd_status_t
nasd_obj_read_simple(
  int                       partnum,
  nasd_identifier_t         nid,
  nasd_offset_t             offset,
  nasd_len_t                in_len,
  nasd_uint64              *bms_targ_p,
  int                       is_read2,
  int                       is_remote, 
  nasd_procpipe_t          *byte_pipe,
  nasd_len_t               *out_len,
  nasd_security_context_t  *contextp)
{
  nasd_odc_ent_t *ne, *ents[NASD_DRIVE_MAX_BLOCKMAPCHUNK], ich, *il;
  nasd_blkrec_t blks[NASD_DRIVE_MAX_BLOCKMAPCHUNK];
  nasd_offset_t lastoff, curoff;
  nasd_delaycounter_t delayer;
  int i, d, t, j, first_null;
  nasd_nodenum_t nodenum;
  nasd_offset_t get_off;
  nasd_len_t remain, l;
  nasd_oblkcnt_t blcnt;
  nasd_uint64 tr, bus;
  nasd_oblkno_t bloff;
  nasd_od_node_t *np;
  nasd_status_t rc;
  nasd_len_t len;
  nasd_byte_t *sendp;
  int ne_rlock_held, need_io;
  nasd_timespec_t pipe_stall_tm, stall_tm, ilook_tm, bmap_tm;
  nasd_timespec_t setup_tm, lget_tm, pget_tm, issue_tm, core_tm;
  nasd_timer_t tm, tm_core;
  nasd_uint64 ba;

#if 0
nasd_printf("READ 0x%" NASD_ID_FMT "  offset %5lu  len %5u\n", nid, (unsigned long)offset, (unsigned)in_len);
#endif
  NASD_TM_START(&tm_core);
  NASD_TIMESPEC_ZERO(pipe_stall_tm);
  NASD_TIMESPEC_ZERO(stall_tm);
  NASD_TIMESPEC_ZERO(ilook_tm);
  NASD_TIMESPEC_ZERO(bmap_tm);
  NASD_TIMESPEC_ZERO(lget_tm);
  NASD_TIMESPEC_ZERO(pget_tm);
  NASD_TIMESPEC_ZERO(setup_tm);
  NASD_TIMESPEC_ZERO(issue_tm);
  NASD_TIMESPEC_ZERO(core_tm);
  ba = 0;

  ne_rlock_held = 0;
  need_io = 1;
  *out_len = 0;
#if NASD_OD_EXT_PTR > 0
  bzero( blks, NASD_DRIVE_MAX_BLOCKMAPCHUNK*sizeof(nasd_blkrec_t));
#endif /* NASD_OD_EXT_PTR > 0 */
  /* ents[] and blks[] are sized for the maximum chunk; clamp the tunable */
  if (nasd_od_obj_blockmapchunk > NASD_DRIVE_MAX_BLOCKMAPCHUNK) {
    nasd_printf("nasd_obj_read_simple(): nasd_od_obj_blockmapchunk=%d exceeding maximum %d\n",
      nasd_od_obj_blockmapchunk, NASD_DRIVE_MAX_BLOCKMAPCHUNK);
    nasd_od_obj_blockmapchunk = NASD_DRIVE_MAX_BLOCKMAPCHUNK;
  }

  if (bms_targ_p) {
    tr = *bms_targ_p;
    /*
     * The per-block delay is only applied when tr is nonzero, so a
     * zero rate means "unpaced". Do not divide by it.
     */
    bus = tr ? ((NASD_OD_BASIC_BLOCKSIZE*1000)/tr) : 0;
  }
  else {
    tr = bus = 0;
  }
#if DBG_READ_SIMPLE > 0
  if (bms_targ_p) {
    nasd_printf("read_simple: target rate %lu bytes/ms %lu us/block\n", tr, bus);
  }
#endif /* DBG_READ_SIMPLE > 0 */

  NASD_TM_START(&tm);
  rc = nasd_od_decompose_control(nid, &nodenum);
  if (rc == NASD_SUCCESS) {
    /* control object: no node reference is taken on this path */
    NASD_TM_STOP_ACCUM_TS(&tm,&ilook_tm);
    rc = nasd_obj_control_read_simple(partnum, nodenum, offset, is_read2,
      in_len, byte_pipe, out_len);
    goto done_read;
  }

  rc = nasd_odc_node_get_from_id(partnum, nid, &ne);
  NASD_TM_STOP_ACCUM_TS(&tm,&ilook_tm);
  if (rc) {
    goto done_read;
  }
  
  NASD_TM_START(&tm);

  /* from here on every exit must go through get_out */
  NASD_ODC_RLOCK_BLOCK_DATA(ne);
  ne_rlock_held = 1;

  NASD_ODC_LOCK_BLOCK(ne);

  np = ne->data.node;

  len = in_len;
  if ((offset + len) > np->object_len) {
    if (offset >= np->object_len) {
      /* read starts past EOF - return no bytes */
#if DBG_READ_SIMPLE > 0
      nasd_printf("read starts past EOF, skipping\n");
#endif /* DBG_READ_SIMPLE > 0 */
      rc = NASD_SUCCESS;
      NASD_ODC_UNLOCK_BLOCK(ne);
      goto get_out;
    }
    len = np->object_len - offset;
    if(contextp)
      contextp->remain = len;
#if DBG_READ_SIMPLE > 0
    nasd_printf("read past EOF shortened from %d to %d\n", in_len, len);
#endif /* DBG_READ_SIMPLE > 0 */
  }
  remain = len;

  NASD_ODC_UNLOCK_BLOCK(ne);

  /*
   * bloff:   object block number currently being processed
   * curoff:  byte offset of the start of that block
   * lastoff: byte offset of the next byte to ship
   */
  bloff = offset / NASD_OD_BASIC_BLOCKSIZE;
  curoff = bloff * NASD_OD_BASIC_BLOCKSIZE;
  lastoff = offset;
  rc = NASD_SUCCESS;
  
  NASD_TM_STOP_ACCUM_TS(&tm,&setup_tm);
  
  if (np->flags&NASD_ND_ATOMIC) {
    /* Disallowing this because it's utterly untested */
    if (is_remote) {
      nasd_printf("DRIVE: Remote execution on atomic object unsupported\n"); /*DB*/
      rc = NASD_OP_NOT_SUPPORTED;
      need_io=0;
    } else {
      /* atomic-mode object: the data lives in the node itself */
      NASD_ASSERT(np->object_len <= NASD_ND_ATOMIC_SIZE);
      d = offset;
      l = len;
      sendp = (nasd_byte_t *)&np->ptrs[d];
      NASD_TM_START(&tm);
      rc = byte_pipe->push(byte_pipe->state, sendp, l, NULL, NULL, NULL);
      NASD_TM_STOP_ACCUM_TS(&tm,&pipe_stall_tm);
      ba += l;
      lastoff += l;
      *out_len += l;
      need_io = 0;
    }
  }
  
  if (need_io && len) {
    if ((is_remote) && (!ne->invocation)) {
      /*
       * Leave through get_out so the node read lock and reference
       * taken above are dropped.
       */
      rc = NASD_REMOTE_UNATTACHED;
      goto get_out;
    }
    if (is_remote) {
      /* We can't set this up in nasd_obj_remote_invoke because we don't have the ne yet */
      ne->invocation->original=byte_pipe;
      ne->invocation->nid = nid ;
      ne->invocation->partnum = partnum;
      byte_pipe=&(ne->invocation->pipe);
      ne->invocation->f->call_start(ne);
    }
    do {
      if (is_remote && ne->invocation) ne->invocation->offset = offset;
      blcnt = (remain + NASD_OD_BASIC_BLOCKSIZE - 1) / NASD_OD_BASIC_BLOCKSIZE;
      t = NASD_MIN(blcnt, nasd_od_obj_blockmapchunk);
      
#if DBG_READ_SIMPLE > 0
      nasd_printf("in_len=%d len=%d remain=%d blcnt=%d t=%d\n", in_len, len, remain, blcnt, t);
#endif /* DBG_READ_SIMPLE > 0 */

      /* empty I/O chain; the block getters below may append to it */
      ich.inext = ich.iprev = &ich;

      /*
       * This gets a bit complicated here. First, we look in the
       * cache for our blocks by logical address in the object.
       * Those that we don't find, we bmap and look again for
       * by physical number.
       */
      get_off = curoff;
      first_null = t;
      NASD_TM_START(&tm);
      for(i=0;i<t;i++) {
        rc = nasd_odc_block_get_logical(ne,
          NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_MLOAD,
          &ents[i], nid, get_off, NASD_ODC_T_DATA, &ich);
        if (rc || (ents[i] == NULL)) {
          /* a failed logical lookup is treated as a miss */
          ents[i] = NULL;
          first_null = NASD_MIN(first_null, i);
        }
        get_off += NASD_OD_BASIC_BLOCKSIZE;
      }
      NASD_TM_STOP_ACCUM_TS(&tm,&lget_tm);


      /*
       * All NULL-valued entries need to be bmap'd and
       * fetched.
       */
      while(first_null < t) {
        /* find j, the last index of the NULL run starting at first_null */
        for(j=i=first_null;i<t;i++) {
          if (ents[i])
            break;
          j = i;
        }
        /* bmap first_null..j */
        NASD_TM_START(&tm);
        rc = nasd_od_bmap(ne, bloff + first_null, j-first_null+1, 0, 0,
          partnum, 0, &blks[first_null], NULL, NULL, NULL);
        NASD_TM_STOP_ACCUM_TS(&tm,&bmap_tm);
        if (rc) {
          /*
           * NOTE(review): entries at index >= i that were found by the
           * logical lookup are not released here - confirm whether
           * that leaks references.
           */
          nasd_od_bmap_release(blks, t, 0, 0);
          for(j=0;j<i;j++) {
            /* entries in the NULL run hold no reference */
            if (ents[j])
              nasd_odc_block_release(ents[j]);
          }
          goto get_out;
        }

        /* fetch blocks */
        get_off = curoff + (first_null * NASD_OD_BASIC_BLOCKSIZE);
        NASD_TM_START(&tm);
        for(i=first_null;i<=j;i++) {
          /* fetch iff non-zero-fill block */
          if (blks[i].blkno) {
            rc = nasd_odc_block_get(ne, blks[i].blkno,
              NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_MLOAD,
              &ents[i], nid, get_off, NASD_ODC_T_DATA, &ich);
            if (rc) {
              nasd_od_bmap_release(blks, t, 0, 0);
              for(j=0;j<i;j++) {
                /* zero-fill blocks in the run are still NULL */
                if (ents[j])
                  nasd_odc_block_release(ents[j]);
              }
              NASD_TM_STOP_ACCUM_TS(&tm,&pget_tm);
              goto get_out;
            }
            NASD_ASSERT(ents[i]->offset == get_off);
          }
          /*
           * Advance for every block, zero-fill or not, so that a mapped
           * block following a hole is fetched at its own offset.
           */
          get_off += NASD_OD_BASIC_BLOCKSIZE;
        }
        NASD_TM_STOP_ACCUM_TS(&tm,&pget_tm);

        /* set first_null to next null block */
        for(first_null=j+1;first_null<t;first_null++) {
          if (ents[first_null] == NULL)
            break;
        }
      }

      if (ich.inext != &ich) {
        NASD_TM_START(&tm);
        /*
         * Actually issue I/Os
         */
        il = ich.inext;
        /* disconnect I/O chain */
        ich.iprev->inext = NULL;
        ich.inext->iprev = NULL;
        nasd_od_io_enq(il, NASD_U_READ, NASD_IO_PRI_HI);
        NASD_TM_STOP_ACCUM_TS(&tm,&issue_tm);
      }
      for(i=0;i<t;i++) {
        /*
         * d is the starting byte within this block (nonzero only for
         * the first block of an unaligned read); l is how many bytes
         * of this block to ship.
         */
        l = NASD_OD_BASIC_BLOCKSIZE;
        d = lastoff - curoff;
        if (d)
          l -= d;
        l = NASD_MIN(remain,l);
        /* we must ship l bytes */
        if (ents[i]) {
          /* ship l bytes from block ents[i] */
          if (tr) {
            NASD_BEGIN_DELAYCNT(&delayer);
          }
          NASD_ODC_RLOCK_BLOCK_DATA(ents[i]);
          NASD_TM_START(&tm);
          NASD_ODC_LOCK_BLOCK(ents[i]);
          nasd_odc_wait_not_busy_invalid(ents[i]);
          NASD_ODC_UNLOCK_BLOCK(ents[i]);
          NASD_TM_STOP_ACCUM_TS(&tm,&stall_tm);

          /* Default to sending the data buffer unless we decide otherwise */
          sendp=(nasd_byte_t *)&((char *)ents[i]->data.buf)[d];

          NASD_ASSERT (l <= 8192);
          NASD_ASSERT (l > 0);
          NASD_ASSERT (d >= 0);
          NASD_ASSERT (d < 8192-1);
          NASD_ASSERT (d+l <= 8192);

          NASD_TM_START(&tm);
          rc = byte_pipe->push(byte_pipe->state, sendp, l, NULL, NULL, NULL);
          NASD_TM_STOP_ACCUM_TS(&tm,&pipe_stall_tm);
          ba += l;

          NASD_ODC_RUNLOCK_BLOCK_DATA(ents[i]);
          nasd_odc_block_release(ents[i]);
          if (tr) {
            NASD_DELAY_FROM(&delayer,bus);
          }
        }
        else { 
          /* zero-fill block */
          sendp=(nasd_byte_t *)nasd_odc_zeroblk;
          NASD_TM_START(&tm);
          rc = byte_pipe->push(byte_pipe->state, sendp, l, NULL, NULL, NULL);
          NASD_TM_STOP_ACCUM_TS(&tm,&pipe_stall_tm);
          ba += l;
        }
        if (rc) {
          /*
           * NOTE(review): ents[i+1..t-1] are not released on this
           * path - confirm whether that leaks references.
           */
          nasd_od_bmap_release(blks, t, 0 , 0);
          goto get_out;
        }
        remain -= l;
        curoff += NASD_OD_BASIC_BLOCKSIZE;
        lastoff += l;
        *out_len += l;
        bloff++;
        blcnt--;
      }
      nasd_od_bmap_release(blks, t, 0, 0);
    } while(remain);
    rc = NASD_SUCCESS;
  }
  /*
   * Otherwise rc is left alone: it is NASD_SUCCESS for a zero-length
   * read, or whatever the atomic-object path above produced (including
   * NASD_OP_NOT_SUPPORTED and push failures), and must reach the caller.
   */

get_out:

  /* the node may have no invocation attached; don't dereference NULL */
  if (is_remote && ne->invocation) {
    ne->invocation->f->call_finish(ne,out_len);
  }
  if (ne_rlock_held) {
    NASD_ODC_RUNLOCK_BLOCK_DATA(ne);
    ne_rlock_held = 0;
  }

  /*
   * Caller terminates pipe.
   */
  /* used to unlock ne here */

  nasd_odc_block_release(ne);

done_read:
  NASD_TM_STOP_ACCUM_TS(&tm_core,&core_tm);

  if (is_read2) {
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read2_pipe_stall_time, &pipe_stall_tm);
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read2_stall_time, &stall_tm);
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read2_bmap_time, &bmap_tm);
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read2_ilookup_time, &ilook_tm);
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read2_lget_time, &lget_tm);
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read2_pget_time, &pget_tm);
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read2_setup_time, &setup_tm);
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read2_issue_time, &issue_tm);
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read2_core_time, &core_tm);
    NASD_ATOMIC_ADD64(&nasd_drive_cache_stats.read2_bytes, ba);
  }
  else {
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read_pipe_stall_time, &pipe_stall_tm);
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read_stall_time, &stall_tm);
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read_bmap_time, &bmap_tm);
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read_ilookup_time, &ilook_tm);
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read_lget_time, &lget_tm);
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read_pget_time, &pget_tm);
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read_setup_time, &setup_tm);
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read_issue_time, &issue_tm);
    NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.read_core_time, &core_tm);
    NASD_ATOMIC_ADD64(&nasd_drive_cache_stats.read_bytes, ba);
  }

  return(rc);
}

/*
 * Write len bytes, pulled from byte_pipe, into object nid (partition
 * partnum) starting at byte offset. The number of bytes accepted is
 * added to *out_len.
 *
 * NOTE(review): unlike nasd_obj_read_simple(), *out_len is only ever
 * incremented here, never zeroed - presumably the caller initializes
 * it; confirm.
 *
 * Control-object identifiers are handed to
 * nasd_obj_control_write_simple(). For ordinary objects:
 *  - a write that fits within NASD_ND_ATOMIC_SIZE into an object that
 *    is atomic-mode or zero-length is stored in the node itself;
 *  - a write that would grow an atomic-mode object past that size first
 *    converts it with nasd_obj_deatomize();
 *  - otherwise the range is processed in chunks of at most
 *    nasd_od_obj_blockmapchunk blocks: blocks are bmap'd (allocating as
 *    needed), their cache entries obtained, and data pulled into each
 *    entry in turn. Partially written blocks that are not in core are
 *    read first.
 *
 * If contextp requests NASD_INTEGRITY_DATA, each block is queued on a
 * commit queue and stays write-locked and referenced until
 * nasd_od_commit_inflight() dequeues it; blocks still queued when the
 * write finishes are marked invalid.
 *
 * On success the object length and modify times in the node are updated
 * and the node is dirtied. Returns NASD_SUCCESS or the status of the
 * first failing step. The caller terminates the pipe.
 */
nasd_status_t
nasd_obj_write_simple(
  int                       partnum,
  nasd_identifier_t         nid,
  nasd_offset_t             offset,
  nasd_len_t                len,
  nasd_procpipe_t          *byte_pipe,
  nasd_security_context_t  *contextp,
  nasd_len_t               *out_len)
{
  nasd_odc_ent_t *ne, *ents[NASD_DRIVE_MAX_BLOCKMAPCHUNK];
  nasd_offset_t lastoff, curoff, get_off;
  nasd_blkrec_t blks[NASD_DRIVE_MAX_BLOCKMAPCHUNK];
  int i, d, d2, d3, r, t, j, g, dg;
  nasd_timespec_t cur_time;
  nasd_nodenum_t nodenum;
  nasd_len_t remain, l;
  nasd_oblkcnt_t blcnt;
  nasd_oblkno_t bloff;
  nasd_len_t count;
  nasd_od_node_t *np;
  nasd_status_t rc;
  int ne_wlock_held, need_io;
  nasd_byte_t *digestp;
  int digest_valid;
  nasd_odc_oq_t commit_queue;
  nasd_uint16 protection;
  nasd_timespec_t pipe_stall_tm, stall_tm, ilook_tm, bmap_tm;
  nasd_timer_t tm;
  nasd_uint64 ba;

  NASD_TIMESPEC_ZERO(pipe_stall_tm);
  NASD_TIMESPEC_ZERO(stall_tm);
  NASD_TIMESPEC_ZERO(ilook_tm);
  NASD_TIMESPEC_ZERO(bmap_tm);
  ba = 0;
  digest_valid = 0;

  if(contextp) {
    protection = contextp->protection;
    if(protection & NASD_INTEGRITY_DATA) {
      /* the commit queue is only used (and initialized) with integrity */
      rc = nasd_queue_simple_init(&commit_queue);
      if(rc)
        goto done_write;
    }
  } else {
    protection = NASD_NO_PROTECTION;
  }

#if NASD_OD_EXT_PTR > 0 
  bzero(blks, NASD_DRIVE_MAX_BLOCKMAPCHUNK*sizeof(nasd_blkrec_t));
#endif /* NASD_OD_EXT_PTR > 0 */
  /* ents[] and blks[] are sized for the maximum chunk; clamp the tunable */
  if (nasd_od_obj_blockmapchunk > NASD_DRIVE_MAX_BLOCKMAPCHUNK) {
    nasd_printf("nasd_obj_write_simple(): nasd_od_obj_blockmapchunk=%d exceeding maximum %d\n",
      nasd_od_obj_blockmapchunk, NASD_DRIVE_MAX_BLOCKMAPCHUNK);
    nasd_od_obj_blockmapchunk = NASD_DRIVE_MAX_BLOCKMAPCHUNK;
  }

  ne_wlock_held = 0;
  need_io = 1;
  remain = len;

  NASD_TM_START(&tm);
  rc = nasd_od_decompose_control(nid, &nodenum);
  if (rc == NASD_SUCCESS) {
    /* control object: no node reference is taken on this path */
    NASD_TM_STOP_ACCUM_TS(&tm,&ilook_tm);
    rc = nasd_obj_control_write_simple(partnum, nodenum, offset, len,
      byte_pipe, out_len);
    goto done_write;
  }

  rc = nasd_odc_node_get_from_id(partnum, nid, &ne);
  NASD_TM_STOP_ACCUM_TS(&tm,&ilook_tm);
  if (rc) {
    goto done_write;
  }

  /* used to NASD_ODC_LOCK_BLOCK(ne) here */

  np = ne->data.node;

  /*
   * bloff:   object block number currently being processed
   * curoff:  byte offset of the start of that block
   * lastoff: byte offset just past the last byte written so far
   */
  bloff = offset / NASD_OD_BASIC_BLOCKSIZE;
  curoff = bloff * NASD_OD_BASIC_BLOCKSIZE;
  lastoff = offset;

  NASD_ODC_WLOCK_BLOCK_DATA(ne);
  ne_wlock_held = 1;
  /* rc is NASD_SUCCESS from above */

  /*
   * First, check to see if we should convert to or maintain
   * atomic mode.
   */
  if (len && ((offset+len) <= NASD_ND_ATOMIC_SIZE) &&
    ((np->flags&NASD_ND_ATOMIC) || (np->object_len == 0)))
  {
    /*
     * Object is either atomic-mode or zero-length, and our
     * write fits inside.
     */
    d = np->object_len;
    d2 = offset;
    if (!(np->flags&NASD_ND_ATOMIC)) {
      np->flags |= NASD_ND_ATOMIC;
      if (offset) {
        /* zero out bits that aren't overwritten */
        bzero((char *)np->ptrs, offset);
      }
    }
    else {
      if (offset > np->object_len) {
        /* zero out bits that aren't overwritten */
        bzero((char *)&np->ptrs[d], d2-d);
      }
    }
    /*
     * Receive bits into block
     */
    r = len;
    g = 0;

    NASD_TM_START(&tm);
    while(r) {
      rc = byte_pipe->pull(byte_pipe->state, (nasd_byte_t *)&np->ptrs[d2+g],
                           r, &count, NULL, NULL, NULL, NULL);
      if (rc)
        break;
      r -= count;
      g += count;
    }
    NASD_TM_STOP_ACCUM_TS(&tm,&pipe_stall_tm);
    ba += g;
    dg = g;

    if (rc) {
      need_io = 0;
      lastoff = 0;
      goto get_out;
    }
    rc = NASD_SUCCESS;
    *out_len += dg;
    need_io = 0;
    lastoff = offset + dg;
  }
  if (len && ((offset+len) > NASD_ND_ATOMIC_SIZE)
    && (np->flags&NASD_ND_ATOMIC))
  {
    /*
     * Object is currently atomic-mode, but needs to grow beyond that
     * limit.
     */
    rc = nasd_obj_deatomize(ne, partnum);
    /* error case caught below */
  }

  if (rc) {
    if (ne_wlock_held) {
      NASD_ODC_WUNLOCK_BLOCK_DATA(ne);
    }
    nasd_odc_block_release(ne);
    goto done_write;
  }

  if (len && need_io) {
    do {
      rc = NASD_SUCCESS;
      blcnt = (remain + NASD_OD_BASIC_BLOCKSIZE - 1) / NASD_OD_BASIC_BLOCKSIZE;
      t = NASD_MIN(blcnt, nasd_od_obj_blockmapchunk);
      /*
       * Figure out where on disk the first set of datablocks lives
       */
      if (ne_wlock_held == 0) {
        /* dropped at the end of the previous chunk's mapping phase */
        NASD_ODC_WLOCK_BLOCK_DATA(ne);
        ne_wlock_held = 1;
      }
      NASD_TM_START(&tm);
      rc = nasd_od_bmap(ne, bloff, t, 0, 0, partnum,
        NASD_ODC_B_ALLOC|NASD_ODC_B_FAULT,
        blks, NULL, NULL, NULL);
      NASD_TM_STOP_ACCUM_TS(&tm,&bmap_tm);
      if (rc) {
        nasd_od_bmap_release(blks, t, 0, 0);
        goto get_out;
      }
      /*
       * Get pointers to the first set of pages to pull bits
       * into.
       */
      get_off = curoff;
      for(i=0;i<t;i++) {
        rc = nasd_odc_block_get(ne, blks[i].blkno,
          NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_NOPRE,
          &ents[i], nid, get_off, NASD_ODC_T_DATA, NULL);
        if (rc) {
          db_printf(("ERROR %s from block_get %lu\n", nasd_error_string(rc), (u_long)blks[i].blkno));
          
          nasd_od_bmap_release(blks, t, 0, 0);
          
          for(j=0;j<i;j++)
            nasd_odc_block_release(ents[j]);
          /* XXX deallocate bogusly allocated blocks? */
          goto get_out;
        }
        NASD_ASSERT(ents[i]->offset == get_off);
        get_off += NASD_OD_BASIC_BLOCKSIZE;
      }
      /* the node is not held write-locked while data is pulled */
      NASD_ODC_WUNLOCK_BLOCK_DATA(ne);
      ne_wlock_held = 0;
      /*
       * Pull bits off the wire into these pages
       */
      for(i=0;i<t;i++) {
        /*
         * Within this block: d bytes precede the write, l bytes are
         * written, d2 is the end of the written range, and d3 bytes
         * follow it.
         */
        l = NASD_OD_BASIC_BLOCKSIZE;
        d = lastoff - curoff;
        if (d)
          l -= d;
        l = NASD_MIN(remain,l);
        d2 = l + d;
        d3 = NASD_OD_BASIC_BLOCKSIZE - d2;
        /* drop l bytes into block blks[i].blkno */
        NASD_ODC_WLOCK_BLOCK_DATA(ents[i]);
        NASD_ODC_LOCK_BLOCK(ents[i]);
        NASD_TM_START(&tm);
        nasd_odc_wait_not_busy(ents[i]);
        NASD_TM_STOP_ACCUM_TS(&tm,&stall_tm);
        if (ents[i]->data_flags&NASD_CD_NZ) {
          /*
           * Zero out the bits we aren't about to write
           */
          if (d)
            bzero((char *)ents[i]->data.buf, d);
          if (d3) {
            bzero((char *)&ents[i]->data.buf[d2], d3);
          }
          ents[i]->data_flags &= ~NASD_CD_NZ;
        }
        else {
          if (d || d3) {
            if (ents[i]->data_flags&NASD_CD_INVALID) {
              if (!(ents[i]->data_flags&NASD_CD_BUSY)) {
                /* must get partial block in-core */
                ents[i]->inext = ents[i]->iprev = NULL;
                ents[i]->iocb = NULL;
                ents[i]->iocb_arg = NULL;
                ents[i]->data_flags |= NASD_CD_BUSY;
                NASD_ASSERT(!(ents[i]->io_flags&(NASD_CI_IOQ|NASD_CI_DISPATCH)));
                NASD_ODC_UNLOCK_BLOCK(ents[i]);
                nasd_od_io_enq(ents[i], NASD_U_READ, NASD_IO_PRI_HI);
                NASD_ODC_LOCK_BLOCK(ents[i]);
              }
              NASD_TM_START(&tm);
              nasd_odc_wait_not_busy_invalid(ents[i]);
              NASD_TM_STOP_ACCUM_TS(&tm,&stall_tm);
            }
          }
        }
        nasd_odc_dirty_ent(ents[i]);

        NASD_ODC_UNLOCK_BLOCK(ents[i]);

        r = l;
        g = 0;

        /*
         * Start every block with "no valid digest". The flag is tested
         * after the pull whether or not integrity is enabled, so it
         * must never be read uninitialized.
         */
        digest_valid = 0;

        if(protection & NASD_INTEGRITY_DATA) {
          NASD_ODC_Q_INS_NOLOCK(&commit_queue, ents[i], s);
#if NASD_OD_EXT_PTR > 0
          digestp = ents[i]->digest;
#else /* NASD_OD_EXT_PTR > 0 */
          digestp = NULL;
#endif /* NASD_OD_EXT_PTR > 0 */
        } else {
          digestp = NULL;
        }

        while(r) {
          NASD_ASSERT(r <= 8192);
          NASD_ASSERT(d + g < 8192);
          NASD_TM_START(&tm);
          rc = byte_pipe->pull(byte_pipe->state,
                               (nasd_byte_t *)&((char *)ents[i]->data.buf)[d+g],
                               r, &count, digestp, &digest_valid,
                               nasd_od_commit_inflight,
                               &commit_queue);
          NASD_TM_STOP_ACCUM_TS(&tm,&pipe_stall_tm);
          if (rc)
            break;
          NASD_ASSERT(count <= r);
          r -= count;
          g += count;    
        }
        ba += g;

#if NASD_OD_EXT_PTR > 0
        if((protection & NASD_INTEGRITY_DATA) && digest_valid && digestp) {
          /* record the block's digest alongside its block pointer */
          NASD_ODC_WLOCK_BLOCK_DATA(blks[i].odc_entp);
          NASD_ODC_LOCK_BLOCK(blks[i].odc_entp);
          bcopy(ents[i]->digest, blks[i].digest, sizeof(nasd_digest_t));
          nasd_odc_dirty_ent(blks[i].odc_entp);
          NASD_ODC_UNLOCK_BLOCK(blks[i].odc_entp);
          NASD_ODC_WUNLOCK_BLOCK_DATA(blks[i].odc_entp);
        }
#endif /* NASD_OD_EXT_PTR > 0 */

        NASD_ODC_LOCK_BLOCK(ents[i]);
        ents[i]->data_flags &= ~(NASD_CD_INVALID|NASD_CD_NZ);
        if(digest_valid)
          ents[i]->data_flags |= NASD_CD_PDIGEST;
        NASD_ODC_UNLOCK_BLOCK(ents[i]);

        /*
         * Wake anyone waiting on this block now that its flags are
         * updated, and before our reference is dropped below, so the
         * entry is not touched after we release it.
         */
        NASD_BROADCAST_COND(ents[i]->cond);
        
        /* if we're doing integrity, this will be done in the
           commit_inflight callback once the digests have been
           verified. */
        if(!(protection & NASD_INTEGRITY_DATA)) {
          NASD_ODC_WUNLOCK_BLOCK_DATA(ents[i]);
          nasd_odc_block_release(ents[i]);
        }

        remain -= l;
        curoff += NASD_OD_BASIC_BLOCKSIZE;
        lastoff += l;
        *out_len += l;
        bloff++;
        blcnt--;
        if (rc) {
          /* XXX this seems wrong -- if there was an error from the pull
             (security or not), we should invalidate this cache block
             and roll back to previous block boundary.  -mju */
          /*
           * NOTE(review): ents[i+1..t-1] are not released on this
           * path - confirm whether that leaks references.
           */
          nasd_od_bmap_release(blks, t, 0, 0);
          goto get_out;
        }
      }
      nasd_od_bmap_release(blks, t, 0, 0);
    } while(remain);
    need_io = 0;
  }

  if (need_io) {
    NASD_ASSERT(len == 0);
    /* nothing to do */
    need_io = 0;
  }

  /* success: extend the object if needed and stamp the modify times */
  np->object_len = NASD_MAX(np->object_len,lastoff);
  nasd_gettime(&cur_time);
  np->attr_modify_time = cur_time;
  np->object_modify_time = cur_time;
  np->fs_attr_modify_time = cur_time;
  np->fs_object_modify_time = cur_time;

  rc = NASD_SUCCESS;

  nasd_odc_dirty_ent(ne);

get_out:
  if (ne_wlock_held) {
    NASD_ODC_WUNLOCK_BLOCK_DATA(ne);
    ne_wlock_held = 0;
  }

  /*
   * Caller terminates pipe.
   */

  /* used to NASD_ODC_UNLOCK_BLOCK(ne) here */

  nasd_odc_block_release(ne);

  if(protection & NASD_INTEGRITY_DATA) {
    /* if there are any blocks left on the commit queue, that means
       that they don't have valid digests, and thus their data cannot
       be trusted.  usually this will happen because we got a bad
       digest in the middle of the stream.  clean up. */
    while(NASD_ODC_Q_SIZE(&commit_queue) > 0) {
      /* ne is reused here for the queued data block entries */
      NASD_ODC_Q_DEQ_TAIL_NOLOCK(&commit_queue, ne, s);
      NASD_ODC_LOCK_BLOCK(ne);
      /* XXX the cache will need to be fixed before this will work */
      nasd_printf("write_simple: invalidating nid 0x%" NASD_ID_FMT
                  " blkno %d offset %" NASD_64s_FMT "\n",
                  ne->identifier, ne->blkno, ne->offset);
      ne->data_flags |= (NASD_CD_INVALID | NASD_CD_SECURITY);
      ne->data_flags &= ~(NASD_CD_PDIGEST);
      NASD_ODC_UNLOCK_BLOCK(ne);
      NASD_ODC_WUNLOCK_BLOCK_DATA(ne);
      NASD_BROADCAST_COND(ne->cond);
      nasd_odc_block_release(ne);
    }
  }

done_write:
  NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.write_pipe_stall_time, &pipe_stall_tm);
  NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.write_stall_time, &stall_tm);
  NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.write_bmap_time, &bmap_tm);
  NASD_ATOMIC_TIMESPEC_ADD(&nasd_drive_cache_stats.write_ilookup_time, &ilook_tm);
  NASD_ATOMIC_ADD64(&nasd_drive_cache_stats.write_bytes, ba);

  return(rc);
}

/*
 * Commit callback handed to the pipe's pull routine by
 * nasd_obj_write_simple(): signals that len bytes starting at object
 * offset start are now checked and can be made public.
 *
 * commit_rock is the write's commit queue. One entry is dequeued for
 * each block the range [start, start+len) touches; its data write lock
 * is dropped and its reference released. At least one entry is always
 * dequeued, even when len is zero.
 */
void
nasd_od_commit_inflight(void *commit_rock,
                        nasd_offset_t start,
                        nasd_len_t len)
{
  nasd_odc_oq_t *cq;
  nasd_odc_ent_t *ent;
  nasd_len_t in_blk;
  int blkoff;

  cq = (nasd_odc_oq_t *) commit_rock;
  
  for(;;) {
    NASD_ODC_Q_DEQ_TAIL_NOLOCK(cq, ent, s);

    /* the dequeued entry should be the block containing start */
    NASD_ASSERT(ent->offset <= start);
    NASD_ASSERT(start <= ent->offset + NASD_OD_BASIC_BLOCKSIZE);
    NASD_ASSERT(ent->type == NASD_ODC_T_DATA);

    NASD_ODC_WUNLOCK_BLOCK_DATA(ent);
    nasd_odc_block_release(ent);

    /* bytes of the committed range that fall within this block */
    blkoff = start % NASD_OD_BASIC_BLOCKSIZE;
    in_blk = NASD_OD_BASIC_BLOCKSIZE - blkoff;

    /*
     * Compare before subtracting: if nasd_len_t is an unsigned type
     * (its definition is not visible here), "len -= in_blk" would wrap
     * instead of going non-positive whenever the range ends inside this
     * block, and the loop would keep dequeuing.
     */
    if (len <= in_blk)
      break;

    len -= in_blk;
    start += in_blk;
  }
}

/*
 * Set attributes on object nid in partition partnum.
 *
 * fieldmask selects which fields of *in_attrp are applied to the
 * object's node. Once the node has been obtained, the node's
 * attributes are copied to *out_attrp (via nasd_odc_get_attr_from_ent)
 * and the node entry is marked dirty on every exit path, including
 * error exits.
 *
 * Returns:
 *   NASD_BAD_ATTR_SET      fieldmask has bits outside
 *                          NASD_ATTR_VALID_BITS, or includes
 *                          NASD_ATTR_BLOCKS_USED, or the requested
 *                          object_len exceeds NASD_OD_MAX_OBJ_LEN
 *   NASD_OP_NOT_SUPPORTED  fieldmask includes NASD_ATTR_BLOCK_SIZE
 *                          or NASD_ATTR_LAYOUT_HINT
 *   otherwise the status of the first failing helper, or the
 *   (successful) status of the last one.
 */
nasd_status_t
nasd_obj_setattr(
  int                 partnum,
  nasd_identifier_t   nid,
  nasd_attribute_t   *in_attrp,
  nasd_fieldmask_t    fieldmask,
  nasd_attribute_t   *out_attrp)
{
  nasd_oblkno_t new_blk, old_blk, new_fb;
  nasd_odc_prealloc_adj_handle_t pah;
  nasd_offset_t new_len, old_len;
  nasd_blkcnt_t prealloc_blocks;
  nasd_odc_ent_t *ne, *ent;
  nasd_timespec_t cur_time;
  nasd_odc_icpart_t *icp;
  int k, l, len_changed;
  nasd_status_t rc, rc2;
  nasd_od_part_t *part;
  nasd_blkrec_t blkrec;
  nasd_od_node_t *np;

  /* nasd_int64 pb_got; */
#if NASD_OD_EXT_PTR > 0 
  /* blkrec is handed to nasd_od_bmap_release() on every exit path
     below, so give it a known-empty state up front */
  blkrec.odc_entp = NULL; 
  blkrec.flags = 0;
#endif /* NASD_OD_EXT_PTR > 0 */
db_printf(("setattr, nid=%" NASD_ID_FMT " in_attrp=%lx fieldmask=%lx out_attrp=%lx\n", nid, (u_long)in_attrp, (u_long)fieldmask, (u_long)out_attrp));
  /* reject unknown bits and fields that may not be set this way
     before touching the object */
  if (fieldmask&(~NASD_ATTR_VALID_BITS))
    return(NASD_BAD_ATTR_SET);

  if (fieldmask&NASD_ATTR_BLOCKS_USED)
    return(NASD_BAD_ATTR_SET);

  if (fieldmask&NASD_ATTR_BLOCK_SIZE)
    return(NASD_OP_NOT_SUPPORTED);

  if (fieldmask&NASD_ATTR_LAYOUT_HINT)
    return(NASD_OP_NOT_SUPPORTED);

  /* the entry obtained here is released at the bottom of the function */
  rc = nasd_odc_node_get_from_id(partnum, nid, &ne);
  if (rc)
    return(rc);
  np = ne->data.node;

  part = &PART(partnum);
  icp = &nasd_odc_state->parts[partnum];

  /* data write lock first, then the block lock */
  NASD_ODC_WLOCK_BLOCK_DATA(ne);
  NASD_ODC_LOCK_BLOCK(ne);
  nasd_odc_wait_not_busy_invalid(ne);

  nasd_gettime(&cur_time);

  /* if the caller is not changing preallocation, re-request the
     current amount so the adjust handle below is still set up */
  if (fieldmask&NASD_ATTR_BLOCK_PREALLOCATION)
    prealloc_blocks = in_attrp->block_preallocation;
  else
    prealloc_blocks = np->blocks_preallocated;

  /*
   * Make sure we can grant preallocation here.
   * Don't do layout stuff yet, though, because we
   * might change object length.
   */
  rc = nasd_od_layout_init_adj_prealloc(partnum, ne, prealloc_blocks, &pah);
  if (rc)
    goto bad;

  /*
   * From here on, failures go to bad2 so that the preallocation
   * adjustment started above is cancelled.
   */
  len_changed = 0;
  if (fieldmask&NASD_ATTR_OBJECT_LEN) {
    old_len = np->object_len;
    new_len = in_attrp->object_len;
    if (new_len > NASD_OD_MAX_OBJ_LEN) {
      rc = NASD_BAD_ATTR_SET;
      goto bad2;
    }
    if ((new_len > NASD_ND_ATOMIC_SIZE) && (np->flags & NASD_ND_ATOMIC)) {
      /*
       * Object is currently atomic-mode, but needs to grow
       * beyond that limit.
       */
      rc = nasd_obj_deatomize(ne, partnum);
      if (rc) {
        goto bad2;
      }
    }
    /*
     * The four cases below (atomic grow, non-atomic grow, atomic
     * truncate, non-atomic truncate) each re-test np->flags, which
     * the deatomize call above presumably changes.
     */
    if ((np->flags&NASD_ND_ATOMIC) && (new_len > old_len)
      && (new_len <= NASD_ND_ATOMIC_SIZE))
    {
      /*
       * Growing atomic-mode object, but still keeping within
       * atomic bound. Zero out relevant part.
       */
      k = np->object_len;
      l = new_len;
      /*
       * NOTE(review): &np->ptrs[k] advances by k elements of ptrs,
       * not k bytes. If ptrs is not a byte array this zeroes the
       * wrong region -- confirm against the declaration of
       * nasd_od_node_t and the atomic-mode read/write paths.
       */
      bzero((char *)&np->ptrs[k], l-k);
      np->object_len = new_len;
      len_changed = 1;
    }
    if ((!(np->flags&NASD_ND_ATOMIC)) && (new_len > old_len)) {
      /*
       * Extend object. Set the new length. If the last block
       * used to be partially written, make the part of it
       * that's newly-in-bounds zero.
       */
      k = old_len % NASD_OD_BASIC_BLOCKSIZE;
      if (k) {
        old_blk = old_len / NASD_OD_BASIC_BLOCKSIZE;
        new_blk = new_len / NASD_OD_BASIC_BLOCKSIZE;
        /* last block was partially written */
        if (old_blk == new_blk) {
          /* growth stays inside the same block */
          l = (new_len%NASD_OD_BASIC_BLOCKSIZE) - k;
        }
        else {
          /* growth runs past this block: zero to its end */
          l = NASD_OD_BASIC_BLOCKSIZE - k;
        }
        NASD_ASSERT(l);
        /* zero l bytes of block old_blk at from offset k */
        /* get the block */
        rc = nasd_od_bmap(ne, old_blk, 1, 0, 0, partnum,
          0, &blkrec, NULL, NULL, NULL);
        if (rc)
          goto bad2;
        if (blkrec.blkno) {
          /* blkrec.blkno is not auto-zero-fill; get it and do the zero */
          rc = nasd_odc_block_get(ne, blkrec.blkno,
            NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD|NASD_ODC_L_NOPRE,
            &ent, ne->identifier, old_blk * NASD_OD_BASIC_BLOCKSIZE,
            NASD_ODC_T_DATA, NULL);
          if (rc)
            goto bad2;
          NASD_ASSERT(ent);
          NASD_ODC_LOCK_BLOCK(ent);
          nasd_odc_dirty_ent(ent);
          nasd_odc_wait_not_busy_invalid(ent);
          bzero((char *)&ent->data.buf[k], l);
          NASD_ODC_UNLOCK_BLOCK(ent);
          nasd_odc_block_release(ent);
        }
      }
      np->object_len = new_len;
      len_changed = 1;
    }
    if ((old_len > new_len) && (np->flags&NASD_ND_ATOMIC)) {
      /*
       * Object is atomic-mode, and we're truncating. Easy.
       */
      NASD_ASSERT(old_len <= NASD_ND_ATOMIC_SIZE);
      np->object_len = new_len;
      if (new_len == 0) {
        /* an empty object leaves atomic mode */
        np->flags &= ~NASD_ND_ATOMIC;
      }
      /*
       * NOTE(review): same element-vs-byte indexing question as the
       * atomic grow case above.
       */
      bzero((char *)&np->ptrs[new_len], old_len-new_len);
      len_changed = 1;
    }
    if ((old_len > new_len) && (!(np->flags&NASD_ND_ATOMIC))) {
      /*
       * Truncate object. We're not atomic-mode.
       */
      old_blk = old_len / NASD_OD_BASIC_BLOCKSIZE;
      new_blk = new_len / NASD_OD_BASIC_BLOCKSIZE;
      /*
       * new_fb is first block _after_ the new last block
       * (that is, the first block to delete)
       */
      if (new_len%NASD_OD_BASIC_BLOCKSIZE)
        new_fb = new_blk + 1;
      else
        new_fb = new_blk;
      if (new_fb <= old_blk) {
        /* unmap blocks new_fb..old_blk inclusive */
        rc = nasd_od_bunmap(ne, new_fb, old_blk-new_fb+1, partnum);
        if (rc) {
          goto bad2;
        }
        rc = nasd_od_bfind_last_block(ne, partnum, new_len);
        if (rc) {
          goto bad2;
        }
      }
      np->object_len = new_len;
      len_changed = 1;
    }
    /* update timestamp */
    np->object_modify_time = cur_time;
    if (!(fieldmask&NASD_ATTR_MODIFY_TIME)) {
      np->fs_object_modify_time = cur_time;
    }
    rc = NASD_SUCCESS;
  }

  /* every successful setattr stamps the attribute-modify time */
  np->attr_modify_time = cur_time;

  if (fieldmask&NASD_ATTR_ATTR_MODIFY_TIME) {
    np->fs_attr_modify_time = in_attrp->fs_attr_modify_time;
  }
  else {
    np->fs_attr_modify_time = cur_time;
  }
  if (fieldmask&NASD_ATTR_MODIFY_TIME) {
    np->fs_object_modify_time = in_attrp->fs_object_modify_time;
  }
  if (fieldmask&NASD_ATTR_AV) {
    np->akvers = in_attrp->av;
    /* CAPABILITY CACHE blow away cached capabilities */
  }
  if (fieldmask&NASD_ATTR_FS_SPECIFIC) {
    bcopy((char *)in_attrp->fs_specific, (char *)np->fs_specific,
      NASD_FS_SPECIFIC_INFO_SIZE);
  }
bad2:
  /* reached on success (rc == 0) as well as on error after the
     prealloc adjustment was initialized */
  if (rc) {
    /*
     * Something has gone wrong, and we're on our way out
     * with an error. Undo the preallocation.
     */
    rc2 = nasd_od_layout_cancel_adj_prealloc(partnum, ne, &pah);
    if (rc2) {
      NASD_PANIC();
    }
  }
  else {
    /* commit the preallocation change, telling layout whether the
       object length moved */
    rc2 = nasd_od_layout_adj_prealloc(partnum, ne, &pah, len_changed);
    if (rc2)
      NASD_PANIC();
  }

bad:  
  /*
   * Common exit. Note that the attributes are copied out and the
   * node entry is dirtied here even when rc reports an error.
   */
  nasd_od_bmap_release(&blkrec, 1, 0, 0);
  nasd_odc_get_attr_from_ent(ne, out_attrp, 1);
  nasd_odc_dirty_ent(ne);
  NASD_ODC_UNLOCK_BLOCK(ne);
  NASD_ODC_WUNLOCK_BLOCK_DATA(ne);
  nasd_odc_block_release(ne);

  return(rc);
}

/*
 * Flush object nid of partition partnum.
 *
 * Looks up the object's node entry, hands it to
 * nasd_odc_flush_obj(), and drops the entry again. Returns the
 * lookup status if the lookup fails, otherwise the flush status.
 */
nasd_status_t
nasd_obj_flush(
  int                 partnum,
  nasd_identifier_t   nid)
{
  nasd_odc_ent_t *node_ent;
  nasd_status_t status;

  status = nasd_odc_node_get_from_id(partnum, nid, &node_ent);
  if (!status) {
    /* lookup succeeded: flush, then give the entry back */
    status = nasd_odc_flush_obj(node_ent);
    nasd_odc_block_release(node_ent);
  }

  return(status);
}

/*
 * Remove object nid from partition partnum.
 *
 * The node is marked NASD_ND_DELETE. The first caller to find the
 * entry without NASD_CD_DELETING becomes the deleter: it attaches a
 * flush/countdown record (deletecp) to the entry, waits for that
 * record's counter to reach zero, then unmaps the object's blocks,
 * clears both node pagetable entries, decrements the refcount slot
 * for the first pagetable block, and hands the node to the layout
 * code for final disposal. Any other caller arriving while a delete
 * is in progress just waits on the same record and returns
 * NASD_SUCCESS.
 *
 * Returns:
 *   status of nasd_od_decompose_id / nasd_odc_node_get_from_id on
 *     lookup failure
 *   NASD_NOT_ON_CONTROL  nid names a reserved node
 *   NASD_NO_MEM          no flush record could be allocated
 *   status of nasd_od_bunmap if unmapping data blocks fails
 *   NASD_SUCCESS otherwise
 */
nasd_status_t
nasd_obj_remove(
  int                 partnum,
  nasd_identifier_t   nid)
{
  nasd_odc_ent_t *ne, *npte, *npte2, *ref_ent1;
  nasd_blkno_t pte1, pte2, no, ref1, nodeblk;
  nasd_status_t rc, rc2, rc3;
  nasd_timespec_t cur_time;
  nasd_odc_icpart_t *icp;
  nasd_nodenum_t nodenum;
  nasd_blkno_t lvl2_hint;
  nasd_generation_t gen;
  nasd_od_part_t *part;
  nasd_odc_flush_t *fl;
  int dodelete, n, rn;
  nasd_oblkcnt_t cnt;
  nasd_od_node_t *np;

  /*
   * Just doing this to get the partition number- other results
   * will be rechecked following an identical decomposition in
   * nasd_odc_node_get_from_id().
   */
  rc = nasd_od_decompose_id(nid, &n, &nodenum, &lvl2_hint, &gen);
  if (rc)
    return(rc);
  if (nodenum < nasd_reserved_nodes)
    return(NASD_NOT_ON_CONTROL);
  nodenum -= nasd_reserved_nodes;

  rc = nasd_odc_node_get_from_id(partnum, nid, &ne);
  if (rc)
    return(rc);

  np = ne->data.node;

  /* remembered so the pagetable entries can be sanity-checked below */
  nodeblk = ne->blkno;

  NASD_ODC_WLOCK_BLOCK_DATA(ne);
  NASD_ODC_LOCK_BLOCK(ne);
  nasd_odc_wait_not_busy_invalid(ne);
  nasd_odc_dirty_ent(ne);
  np->flags |= NASD_ND_DELETE;
  NASD_ODC_WUNLOCK_BLOCK_DATA(ne);
  if (!(ne->data_flags&NASD_CD_DELETING)) {
    /*
     * We are the first remover. Allocate the flush record BEFORE
     * advertising NASD_CD_DELETING: if the flag were set without a
     * deletecp attached, a later remover would take the "someone
     * else is deleting" branch below and use a flush record that
     * was never attached.
     */
    fl = nasd_odc_flushc_get();
    if (fl == NULL) {
      /*
       * Drop the block lock taken above before bailing out.
       * NOTE(review): NASD_ND_DELETE stays set on the (dirtied) node
       * in this case, as it did before; decide whether it should be
       * rolled back.
       */
      NASD_ODC_UNLOCK_BLOCK(ne);
      nasd_odc_block_release(ne);
      return(NASD_NO_MEM);
    }
    ne->data_flags |= NASD_CD_DELETING;
    dodelete = 1;
    ne->deletecp = fl;
db_printf(("attached deletecp 0x%lx to 0x%lx\n", ne->deletecp, ne));
    fl->counter = 1;
    fl->refcnt = 1;
    NASD_ODC_UNLOCK_BLOCK(ne);
    NASD_ODC_LRU_LOCK();
    ne->lru_flags |= NASD_CL_DELETING;
    /*
     * Take an "internal" reference here so we can get the block
     * back later on, when we're sure that no one else is (or can
     * be) using it. Later, we'll turn this back into a real ref.
     */
    ne->irefcnt++;
    NASD_ODC_LRU_UNLOCK();
  }
  else {
    /* someone else is deleting- we just wait for them */
    dodelete = 0;
    fl = ne->deletecp;
    NASD_LOCK_MUTEX(fl->lock);
    fl->refcnt++;
    NASD_UNLOCK_MUTEX(fl->lock);
    NASD_ODC_UNLOCK_BLOCK(ne);
  }
  nasd_odc_block_release(ne);

  /* wait for the flush record's counter to drain to zero */
  NASD_LOCK_MUTEX(fl->lock);
  while(fl->counter) {
    NASD_WAIT_COND(fl->cond,fl->lock);
  }
  fl->refcnt--;
  n = fl->refcnt;
  NASD_UNLOCK_MUTEX(fl->lock);

  if (n == 0) {
    /* we were last out- clean up */
    nasd_odc_flushc_free(fl);
  }

  if (dodelete == 0) {
    /* we are absolved of responsibility */
    return(NASD_SUCCESS);
  }

  /*
   * Convert our cache-internal reference to a real
   * reference (see above)
   */
  nasd_odc_block_iref_to_ref(ne);

  /*
   * Now we are guaranteed that we are the only users of
   * this object, and no one else may start using it.
   * This lets us avoid worrying about data locks here.
   * First, get rid of the object's data blocks.
   */
  if (!(np->flags&NASD_ND_ATOMIC)) {
    /*
     * Only get rid of blocks if we're non-atomic
     * (otherwise, those "pointers" are really data).
     */
    cnt = (np->object_len + (NASD_OD_BASIC_BLOCKSIZE - 1))
      / NASD_OD_BASIC_BLOCKSIZE;
    if (cnt) {
      rc = nasd_od_bunmap(ne, 0, cnt, partnum);
      if (rc) {
        /* XXX get rid of pte anyway? */
        /*
         * NOTE(review): this return keeps the reference obtained by
         * nasd_odc_block_iref_to_ref() and leaves the deleting flags
         * set on the entry.
         */
        return(rc);
      }
    }
  }

  /*
   * Now we have a zero-length object that we must delete.
   * First, get rid of node pagetable entries.
   */

  part = &PART(partnum);
  icp = &nasd_odc_state->parts[partnum];

  /* locate the pagetable block (in both copies) and slot for this
     node, plus the refcount block/slot covering the first copy */
  no = (nodenum / NASD_OD_NODES_PER_NPT_BLOCK);
  n = nodenum % NASD_OD_NODES_PER_NPT_BLOCK;
  pte1 = nasd_odc_state->disk->npt_ext.first + no;
  pte2 = nasd_odc_state->disk->npt2_ext.first + no;
  rn = NASD_ODC_OFF_IN_REFBLK(pte1);
  ref1 = NASD_ODC_REFBLK_OF(pte1);

  NASD_ODC_ICPART_LOCK_WRITE(icp);
  nasd_gettime(&cur_time);
  part->last_cr_del = cur_time;
  icp->last_objlist_npt = 0;
  icp->last_objlist_off = 0;

  rc2 = nasd_odc_block_get(NULL, pte2,
    NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD,
    &npte2, NASD_ID_NULL, 0, NASD_ODC_T_NPT2, NULL);
  rc = nasd_odc_block_get(NULL, pte1,
    NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD,
    &npte, NASD_ID_NULL, 0, NASD_ODC_T_NPT1, NULL);
  rc3 = nasd_odc_block_get(NULL, ref1,
    NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD,
    &ref_ent1, NASD_ID_NULL, 0, NASD_ODC_T_REFCNT, NULL);
  /* second pagetable copy is cleared first, then the first copy */
  if (rc2 == NASD_SUCCESS) {
    NASD_ODC_WLOCK_BLOCK_DATA(npte2);
    NASD_ODC_LOCK_BLOCK(npte2);
    nasd_odc_dirty_ent(npte2);
    nasd_odc_wait_not_busy_invalid(npte2);
    NASD_ASSERT(npte2->data.pte[n].blkno == nodeblk);
    npte2->data.pte[n].blkno = 0;
    NASD_ODC_UNLOCK_BLOCK(npte2);
    NASD_ODC_WUNLOCK_BLOCK_DATA(npte2);
    nasd_odc_block_release(npte2);
  }
  else {
    NASD_PANIC();
  }
  if (rc == NASD_SUCCESS) {
    NASD_ODC_WLOCK_BLOCK_DATA(npte);
    NASD_ODC_LOCK_BLOCK(npte);
    nasd_odc_dirty_ent(npte);
    nasd_odc_wait_not_busy_invalid(npte);
    NASD_ASSERT(npte->data.pte[n].blkno == nodeblk);
    npte->data.pte[n].blkno = 0;
    NASD_ODC_UNLOCK_BLOCK(npte);
    NASD_ODC_WUNLOCK_BLOCK_DATA(npte);
    nasd_odc_block_release(npte);
  }
  else {
    NASD_PANIC();
  }
  if (rc3 == NASD_SUCCESS) {
    /*
     * NOTE(review): unlike the pagetable blocks above, the refcount
     * block is modified without nasd_odc_dirty_ent(); confirm that
     * refcount blocks are written back by some other mechanism.
     */
    NASD_ODC_LOCK_BLOCK(ref_ent1);
    nasd_odc_wait_not_busy_invalid(ref_ent1);
    ref_ent1->data.cnt[rn]--;
    NASD_ODC_UNLOCK_BLOCK(ref_ent1);
    nasd_odc_block_release(ref_ent1);
  }
  else {
    NASD_PANIC();
  }

  part->num_obj--;
  nasd_part_modified(partnum);
  NASD_ODC_ICPART_UNLOCK_WRITE(icp);

  /*
   * Lose preallocation
   */
  rc = nasd_od_layout_surrender_prealloc(partnum, ne);
  if (rc) {
    NASD_PANIC();
  }

  /*
   * Finally, lose the node block itself entirely.
   */
  rc = nasd_od_layout_node_deleting(partnum, ne);
  if (rc)
    NASD_PANIC();

  if (rc2 != NASD_SUCCESS)
    return(rc2);
  return(rc);
}

/*
 * Flush object nid of partition partnum and then eject it.
 *
 * Returns the lookup status if the node entry cannot be obtained,
 * the flush status if the flush fails (no eject is attempted in
 * that case), and otherwise the status of nasd_odc_obj_eject().
 */
nasd_status_t
nasd_obj_eject(
  int                 partnum,
  nasd_identifier_t   nid)
{
  nasd_odc_ent_t *node_ent;
  nasd_status_t status;

  /* CAPABILITY CACHE */
  /*nasd_security_cache_invalidate(partnum, nid);*/

  status = nasd_odc_node_get_from_id(partnum, nid, &node_ent);
  if (status) {
    return(status);
  }

  /*
   * The flush is not strictly required: nasd_odc_obj_eject()
   * copes with dirty blocks on its own (which matters, because
   * nothing is locked here). Flushing first simply lets many
   * dirty blocks be written out as a batch.
   */
  status = nasd_odc_flush_obj(node_ent);
  if (status == NASD_SUCCESS)
    status = nasd_odc_obj_eject(node_ent);

  nasd_odc_block_release(node_ent);

  return(status);
}

/*
 * Start an indexed read stream.
 *
 * Not implemented here: every argument is ignored, *out_stream_id
 * is left untouched, and NASD_OP_NOT_SUPPORTED is always returned.
 */
nasd_status_t
nasd_obj_start_iread(int partnum,
                     nasd_identifier_t index_nid,
                     nasd_identifier_t data_nid,
                     int interval,
                     int offset,
                     int flownum,
                     nasd_timespec_t earliest_start,
                     nasd_timespec_t latest_start,
                     nasd_client_addr_t client_addr,
                     nasd_index_stream_identifier_t *out_stream_id)
{
  return(NASD_OP_NOT_SUPPORTED);
}

/*
 * Stop an indexed read stream.
 *
 * Not implemented here: stream_id is ignored and
 * NASD_OP_NOT_SUPPORTED is always returned.
 */
nasd_status_t
nasd_obj_stop_iread(nasd_index_stream_identifier_t stream_id)
{
  return(NASD_OP_NOT_SUPPORTED);
}


/*
 * Attach a remote-function invocation to object nid of partition
 * partnum.
 *
 * args_otw_len bytes of argument data are pulled from byte_pipe into
 * a freshly allocated buffer, name is looked up in
 * nasd_remote_dispatch_table, and the resulting invocation record is
 * hung off the object's node entry. An extra reference is taken on
 * the entry so that it stays in-core while the invocation is
 * attached. Finally the function's attach() hook is called.
 *
 * contextp is currently unused (security is ignored on the pull).
 *
 * Returns:
 *   NASD_SUCCESS                  invocation attached
 *   NASD_NO_MEM                   invocation record or argument
 *                                 buffer could not be allocated
 *   NASD_REMOTE_ARGS_FAIL         pulling the arguments failed
 *   NASD_REMOTE_BAD_FUNCTION      name is not in the dispatch table
 *   NASD_REMOTE_ALREADY_ATTACHED  the object already has an
 *                                 invocation (which is left intact)
 *   otherwise the status of nasd_odc_node_get_from_id() or of the
 *   attach() hook.
 */
nasd_status_t
nasd_obj_remote_attach(
  int                         partnum,
  nasd_identifier_t           nid,
  nasd_remote_function_name_t name,
  nasd_len_t                  args_otw_len,
  nasd_procpipe_t            *byte_pipe,
  nasd_security_context_t    *contextp)
{
  nasd_remote_invocation_t      *invocation;
  nasd_odc_ent_t                *ne=NULL;
  nasd_len_t                    remaining,count;
  nasd_byte_t                   *buf;
  nasd_status_t                 toret=NASD_SUCCESS;
  nasd_status_t                 rc;
  nasd_remote_function_t        *p;

  NASD_FREELIST_GET(remote_invocation_freelist,invocation,next,(nasd_remote_invocation_t *));

  if (!invocation) {
    /* previously fell out with toret still NASD_SUCCESS */
    toret = NASD_NO_MEM;
    goto exit_no_free;
  }
  memset(invocation,0,sizeof(nasd_remote_invocation_t)); /* XXX should use freelist initialization */

  invocation->args_otw_len = args_otw_len;
  NASD_Malloc(invocation->args_otw,invocation->args_otw_len,(nasd_byte_t *));
  if (!invocation->args_otw) {
    /* previously fell out with toret still NASD_SUCCESS */
    toret = NASD_NO_MEM;
    goto exit_free_invocation;
  }

  remaining = invocation->args_otw_len;
  buf = invocation->args_otw;

  /* pull until the whole argument buffer has been filled */
  while (remaining) {
    /* XXX ignores security */
    rc = byte_pipe->pull(byte_pipe->state,
                         (nasd_byte_t *)buf,remaining,&count,
                         NULL,NULL,NULL,NULL);    /* digest, digest_valid, commit, commit_rock */
    if (rc) {
      toret = NASD_REMOTE_ARGS_FAIL;
      goto exit_free_invocation_args;
    }
    buf+=count;
    remaining-=count;
  }

  /* the dispatch table ends with an entry whose name is empty */
  for (p=nasd_remote_dispatch_table;p->function_name[0];p++) {
    if (0 == strncmp(p->function_name,name,NASD_REMOTE_FUNCTION_NAME_SIZE)) {
      break;
    }
  }
  if (0==p->function_name[0]) {
    toret = NASD_REMOTE_BAD_FUNCTION;
    goto exit_free_invocation_args;
  }
  invocation->f = p;

  rc = nasd_odc_node_get_from_id(partnum, nid, &ne);
  if (rc) {
    toret = rc;
    goto exit_free_invocation_args;
  }

  NASD_ODC_LRU_LOCK();
  /* We're attaching an invocation to the node entry, so we need to
     make sure it remains in-core */
  ne->refcnt++;
  NASD_ODC_LRU_UNLOCK();

  NASD_ODC_LOCK_BLOCK(ne);
  if (ne->invocation) {
    NASD_ODC_UNLOCK_BLOCK(ne);
    toret = NASD_REMOTE_ALREADY_ATTACHED;
    /*
     * Somebody else's invocation is attached: undo our extra
     * reference only. Do NOT clear ne->invocation -- it is not ours.
     */
    goto exit_unref;
  }
  ne->invocation = invocation;
  NASD_ODC_UNLOCK_BLOCK(ne);

  rc = invocation->f->attach(ne);
  if (rc) {
    toret = rc;
    goto exit_detach;
  }
  /* success: keep the extra reference and the invocation record */
  goto exit_no_free;

 exit_detach:
  /* unhook the invocation we installed, under the same lock that
     was used to install it */
  NASD_ODC_LOCK_BLOCK(ne);
  ne->invocation = NULL;
  NASD_ODC_UNLOCK_BLOCK(ne);
 exit_unref:
  NASD_ODC_LRU_LOCK();
  ne->refcnt--;
  NASD_ODC_LRU_UNLOCK();
 exit_free_invocation_args:
  NASD_Free(invocation->args_otw,invocation->args_otw_len);
 exit_free_invocation:
  NASD_FREELIST_FREE(remote_invocation_freelist,invocation,next);
 exit_no_free:
  /* drop the reference taken by nasd_odc_node_get_from_id(), if any */
  if (ne) nasd_odc_block_release(ne);
  return toret;
}

/*
 * Detach the remote-function invocation from object nid of
 * partition partnum.
 *
 * Calls the function's detach() hook with the block lock held, frees
 * the argument buffer and the invocation record, and drops the extra
 * entry reference that nasd_obj_remote_attach() took.
 *
 * Returns:
 *   the status of nasd_odc_node_get_from_id() if the lookup fails
 *   NASD_REMOTE_UNATTACHED  the object has no invocation attached
 *   otherwise the status of the detach() hook.
 */
nasd_status_t
nasd_obj_remote_detach(
  int                         partnum,
  nasd_identifier_t           nid)
{
  nasd_remote_invocation_t      *invocation;
  nasd_status_t                 rc,toret;
  nasd_odc_ent_t                *ne;

  rc = nasd_odc_node_get_from_id(partnum, nid, &ne);
  if (rc) return rc;

  NASD_ODC_LOCK_BLOCK(ne);
  invocation = ne->invocation;
  if (!invocation) {
    NASD_ODC_UNLOCK_BLOCK(ne);
    /* give back the reference from nasd_odc_node_get_from_id();
       this path used to leak it */
    nasd_odc_block_release(ne);
    return NASD_REMOTE_UNATTACHED;
  }
  toret = invocation->f->detach(ne);

  if (invocation->args_otw) {
    NASD_Free(invocation->args_otw,invocation->args_otw_len);
  }

  ne->invocation=NULL;

  NASD_ODC_UNLOCK_BLOCK(ne);

  /* the record came from this freelist in nasd_obj_remote_attach();
     return it now that nothing points at it any more */
  NASD_FREELIST_FREE(remote_invocation_freelist,invocation,next);

  /* drop the extra reference taken when the invocation was attached */
  NASD_ODC_LRU_LOCK();
  ne->refcnt--;
  NASD_ODC_LRU_UNLOCK();
  nasd_odc_block_release(ne);   /* This decrements refcnt again to
                                   compensate for the increment
                                   implicit in
                                   nasd_odc_node_get_from_id */

  return toret;
}

/*
 * Invoke the remote function attached to object nid.
 *
 * This is a thin wrapper: the request is forwarded unchanged to
 * nasd_obj_read_simple(), with NULL, 0 and 1 supplied for the three
 * arguments that have no counterpart here, and its status is
 * returned as-is.
 */
nasd_status_t
nasd_obj_remote_invoke(
  int                       partnum,
  nasd_identifier_t         nid,
  nasd_offset_t             offset,
  nasd_len_t                in_len,
  nasd_procpipe_t          *byte_pipe,
  nasd_len_t               *out_len,
  nasd_security_context_t  *contextp)
{
  nasd_status_t status;

  status = nasd_obj_read_simple(partnum, nid, offset, in_len,
                                NULL, 0, 1,
                                byte_pipe, out_len, contextp);
  return status;
}




/* Local Variables:  */
/* indent-tabs-mode: nil */
/* tab-width: 2 */
/* End: */
