1b2441318SGreg Kroah-Hartman /* SPDX-License-Identifier: GPL-2.0 */
23d14c5d2SYehuda Sadeh #ifndef _FS_CEPH_OSDMAP_H
33d14c5d2SYehuda Sadeh #define _FS_CEPH_OSDMAP_H
43d14c5d2SYehuda Sadeh
53d14c5d2SYehuda Sadeh #include <linux/rbtree.h>
6a1ce3928SDavid Howells #include <linux/ceph/types.h>
7ef4859d6SAlex Elder #include <linux/ceph/decode.h>
83d14c5d2SYehuda Sadeh #include <linux/crush/crush.h>
93d14c5d2SYehuda Sadeh
/*
 * The osd map describes the current membership of the osd cluster and
 * specifies the mapping of objects to placement groups and placement
 * groups to (sets of) osds.  That is, it completely specifies the
 * (desired) distribution of all data objects in the system at some
 * point in time.
 *
 * Each map version is identified by an epoch, which increases monotonically.
 *
 * The map can be updated either via an incremental map (diff) describing
 * the change between two successive epochs, or as a fully encoded map.
 */
/*
 * Placement group id: the pool it belongs to plus the pg seed
 * within that pool.
 */
struct ceph_pg {
	uint64_t	pool;	/* pool id */
	uint32_t	seed;	/* pg seed */
};
265b191d99SSage Weil
/*
 * Shard value meaning "no shard".  The expansion is parenthesized so the
 * unary minus can never combine with neighbouring tokens at the use site.
 */
#define CEPH_SPG_NOSHARD	(-1)
28dc98ff72SIlya Dryomov
29dc98ff72SIlya Dryomov struct ceph_spg {
30dc98ff72SIlya Dryomov struct ceph_pg pgid;
31dc98ff72SIlya Dryomov s8 shard;
32dc98ff72SIlya Dryomov };
33dc98ff72SIlya Dryomov
34f984cb76SIlya Dryomov int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
35a02a946dSIlya Dryomov int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs);
36f984cb76SIlya Dryomov
/*
 * Pool flag bits (see the CEPH_POOL_FLAG_* note on
 * struct ceph_pg_pool_info::flags).
 */

/* hash pg seed and pool id together */
#define CEPH_POOL_FLAG_HASHPSPOOL	(1ULL << 0)
/* pool is full */
#define CEPH_POOL_FLAG_FULL		(1ULL << 1)
/* pool ran out of quota, will set FULL too */
#define CEPH_POOL_FLAG_FULL_QUOTA	(1ULL << 10)
/* pool is nearfull */
#define CEPH_POOL_FLAG_NEARFULL		(1ULL << 11)
4383ca14fdSSage Weil
443d14c5d2SYehuda Sadeh struct ceph_pg_pool_info {
453d14c5d2SYehuda Sadeh struct rb_node node;
464f6a7e5eSSage Weil s64 id;
4704812acfSIlya Dryomov u8 type; /* CEPH_POOL_TYPE_* */
484f6a7e5eSSage Weil u8 size;
4904812acfSIlya Dryomov u8 min_size;
504f6a7e5eSSage Weil u8 crush_ruleset;
514f6a7e5eSSage Weil u8 object_hash;
5204812acfSIlya Dryomov u32 last_force_request_resend;
534f6a7e5eSSage Weil u32 pg_num, pgp_num;
544f6a7e5eSSage Weil int pg_num_mask, pgp_num_mask;
5517a13e40SIlya Dryomov s64 read_tier;
5617a13e40SIlya Dryomov s64 write_tier; /* wins for read+write ops */
5704812acfSIlya Dryomov u64 flags; /* CEPH_POOL_FLAG_* */
583d14c5d2SYehuda Sadeh char *name;
5942c1b124SIlya Dryomov
6042c1b124SIlya Dryomov bool was_full; /* for handle_one_map() */
613d14c5d2SYehuda Sadeh };
623d14c5d2SYehuda Sadeh
ceph_can_shift_osds(struct ceph_pg_pool_info * pool)632abebdbcSIlya Dryomov static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
642abebdbcSIlya Dryomov {
652abebdbcSIlya Dryomov switch (pool->type) {
662abebdbcSIlya Dryomov case CEPH_POOL_TYPE_REP:
672abebdbcSIlya Dryomov return true;
682abebdbcSIlya Dryomov case CEPH_POOL_TYPE_EC:
692abebdbcSIlya Dryomov return false;
702abebdbcSIlya Dryomov default:
71d24cdcd3SArnd Bergmann BUG();
722abebdbcSIlya Dryomov }
732abebdbcSIlya Dryomov }
742abebdbcSIlya Dryomov
754f6a7e5eSSage Weil struct ceph_object_locator {
7622116525SIlya Dryomov s64 pool;
7730c156d9SYan, Zheng struct ceph_string *pool_ns;
784f6a7e5eSSage Weil };
794f6a7e5eSSage Weil
ceph_oloc_init(struct ceph_object_locator * oloc)8063244fa1SIlya Dryomov static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
8163244fa1SIlya Dryomov {
8263244fa1SIlya Dryomov oloc->pool = -1;
8330c156d9SYan, Zheng oloc->pool_ns = NULL;
8463244fa1SIlya Dryomov }
8563244fa1SIlya Dryomov
ceph_oloc_empty(const struct ceph_object_locator * oloc)8663244fa1SIlya Dryomov static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
8763244fa1SIlya Dryomov {
8863244fa1SIlya Dryomov return oloc->pool == -1;
8963244fa1SIlya Dryomov }
9063244fa1SIlya Dryomov
9130c156d9SYan, Zheng void ceph_oloc_copy(struct ceph_object_locator *dest,
9230c156d9SYan, Zheng const struct ceph_object_locator *src);
9330c156d9SYan, Zheng void ceph_oloc_destroy(struct ceph_object_locator *oloc);
9463244fa1SIlya Dryomov
/*
 * A 51-char inline_name (CEPH_OID_INLINE_LEN minus the NUL-terminator)
 * is long enough for all cephfs and all but one rbd requests: <imgname>
 * in "<imgname>.rbd"/"rbd_id.<imgname>" can be arbitrarily long
 * (~PAGE_SIZE).  It's done once during rbd map; all other rbd requests
 * fit into inline_name.
 *
 * Makes ceph_object_id 64 bytes on 64-bit.
 */
/* Size of the inline name buffer, including room for a NUL-terminator. */
#define CEPH_OID_INLINE_LEN	52

/*
 * Object id (name).
 *
 * Both the inline and the external buffer have space for a
 * NUL-terminator, which is carried around.  It isn't required though:
 * RADOS object names don't have to be NUL-terminated and may contain
 * NULs, which is why the length is tracked separately in name_len.
 */
struct ceph_object_id {
	char	*name;		/* points at inline_name when initialized */
	char	inline_name[CEPH_OID_INLINE_LEN];
	int	name_len;
};
1154295f221SIlya Dryomov
/*
 * Initializer for a struct ceph_object_id: name points at the object's
 * own inline buffer; every other member is zero-initialized.
 */
#define __CEPH_OID_INITIALIZER(oid)	{ .name = (oid).inline_name }

/* Define and initialize an empty ceph_object_id variable named @oid. */
#define CEPH_DEFINE_OID_ONSTACK(oid)					\
	struct ceph_object_id oid = __CEPH_OID_INITIALIZER(oid)
1200384892cSArnd Bergmann
ceph_oid_init(struct ceph_object_id * oid)121d30291b9SIlya Dryomov static inline void ceph_oid_init(struct ceph_object_id *oid)
122d30291b9SIlya Dryomov {
1230384892cSArnd Bergmann *oid = (struct ceph_object_id) __CEPH_OID_INITIALIZER(*oid);
124d30291b9SIlya Dryomov }
125d30291b9SIlya Dryomov
ceph_oid_empty(const struct ceph_object_id * oid)126d30291b9SIlya Dryomov static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
127d30291b9SIlya Dryomov {
128d30291b9SIlya Dryomov return oid->name == oid->inline_name && !oid->name_len;
129d30291b9SIlya Dryomov }
130d30291b9SIlya Dryomov
131d30291b9SIlya Dryomov void ceph_oid_copy(struct ceph_object_id *dest,
132d30291b9SIlya Dryomov const struct ceph_object_id *src);
133d30291b9SIlya Dryomov __printf(2, 3)
134d30291b9SIlya Dryomov void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
135d30291b9SIlya Dryomov __printf(3, 4)
136d30291b9SIlya Dryomov int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
137d30291b9SIlya Dryomov const char *fmt, ...);
138d30291b9SIlya Dryomov void ceph_oid_destroy(struct ceph_object_id *oid);
139d30291b9SIlya Dryomov
1403986f9a4SIlya Dryomov struct workspace_manager {
1413986f9a4SIlya Dryomov struct list_head idle_ws;
1423986f9a4SIlya Dryomov spinlock_t ws_lock;
1433986f9a4SIlya Dryomov /* Number of free workspaces */
1443986f9a4SIlya Dryomov int free_ws;
1453986f9a4SIlya Dryomov /* Total number of allocated workspaces */
1463986f9a4SIlya Dryomov atomic_t total_ws;
1473986f9a4SIlya Dryomov /* Waiters for a free workspace */
1483986f9a4SIlya Dryomov wait_queue_head_t ws_wait;
1493986f9a4SIlya Dryomov };
1503986f9a4SIlya Dryomov
1513d14c5d2SYehuda Sadeh struct ceph_pg_mapping {
1523d14c5d2SYehuda Sadeh struct rb_node node;
1535b191d99SSage Weil struct ceph_pg pgid;
15435a935d7SIlya Dryomov
15535a935d7SIlya Dryomov union {
15635a935d7SIlya Dryomov struct {
1573d14c5d2SYehuda Sadeh int len;
1583d14c5d2SYehuda Sadeh int osds[];
1596f428df4SIlya Dryomov } pg_temp, pg_upmap;
1609686f94cSIlya Dryomov struct {
1619686f94cSIlya Dryomov int osd;
1629686f94cSIlya Dryomov } primary_temp;
1636f428df4SIlya Dryomov struct {
1646f428df4SIlya Dryomov int len;
1656f428df4SIlya Dryomov int from_to[][2];
1666f428df4SIlya Dryomov } pg_upmap_items;
16735a935d7SIlya Dryomov };
1683d14c5d2SYehuda Sadeh };
1693d14c5d2SYehuda Sadeh
1703d14c5d2SYehuda Sadeh struct ceph_osdmap {
1713d14c5d2SYehuda Sadeh struct ceph_fsid fsid;
1723d14c5d2SYehuda Sadeh u32 epoch;
1733d14c5d2SYehuda Sadeh struct ceph_timespec created, modified;
1743d14c5d2SYehuda Sadeh
1753d14c5d2SYehuda Sadeh u32 flags; /* CEPH_OSDMAP_* */
1763d14c5d2SYehuda Sadeh
1773d14c5d2SYehuda Sadeh u32 max_osd; /* size of osd_state, _offload, _addr arrays */
1780bb05da2SIlya Dryomov u32 *osd_state; /* CEPH_OSD_* */
1793d14c5d2SYehuda Sadeh u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
1803d14c5d2SYehuda Sadeh struct ceph_entity_addr *osd_addr;
1813d14c5d2SYehuda Sadeh
1823d14c5d2SYehuda Sadeh struct rb_root pg_temp;
1839686f94cSIlya Dryomov struct rb_root primary_temp;
1849686f94cSIlya Dryomov
1856f428df4SIlya Dryomov /* remap (post-CRUSH, pre-up) */
1866f428df4SIlya Dryomov struct rb_root pg_upmap; /* PG := raw set */
1876f428df4SIlya Dryomov struct rb_root pg_upmap_items; /* from -> to within raw set */
1886f428df4SIlya Dryomov
1892cfa34f2SIlya Dryomov u32 *osd_primary_affinity;
1902cfa34f2SIlya Dryomov
1913d14c5d2SYehuda Sadeh struct rb_root pg_pools;
1923d14c5d2SYehuda Sadeh u32 pool_max;
1933d14c5d2SYehuda Sadeh
1943d14c5d2SYehuda Sadeh /* the CRUSH map specifies the mapping of placement groups to
1953d14c5d2SYehuda Sadeh * the list of osds that store+replicate them. */
1963d14c5d2SYehuda Sadeh struct crush_map *crush;
1979d521470SIlya Dryomov
1983986f9a4SIlya Dryomov struct workspace_manager crush_wsm;
1993d14c5d2SYehuda Sadeh };
2003d14c5d2SYehuda Sadeh
ceph_osd_exists(struct ceph_osdmap * map,int osd)2013b33f692SZhang Zhuoyu static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
202246138faSIlya Dryomov {
203246138faSIlya Dryomov return osd >= 0 && osd < map->max_osd &&
204246138faSIlya Dryomov (map->osd_state[osd] & CEPH_OSD_EXISTS);
205246138faSIlya Dryomov }
206246138faSIlya Dryomov
ceph_osd_is_up(struct ceph_osdmap * map,int osd)2073b33f692SZhang Zhuoyu static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
2083d14c5d2SYehuda Sadeh {
209246138faSIlya Dryomov return ceph_osd_exists(map, osd) &&
210246138faSIlya Dryomov (map->osd_state[osd] & CEPH_OSD_UP);
211246138faSIlya Dryomov }
212246138faSIlya Dryomov
ceph_osd_is_down(struct ceph_osdmap * map,int osd)2133b33f692SZhang Zhuoyu static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
214246138faSIlya Dryomov {
215246138faSIlya Dryomov return !ceph_osd_is_up(map, osd);
2163d14c5d2SYehuda Sadeh }
2173d14c5d2SYehuda Sadeh
2180bb05da2SIlya Dryomov char *ceph_osdmap_state_str(char *str, int len, u32 state);
2192cfa34f2SIlya Dryomov extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
2203d14c5d2SYehuda Sadeh
ceph_osd_addr(struct ceph_osdmap * map,int osd)2213d14c5d2SYehuda Sadeh static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
2223d14c5d2SYehuda Sadeh int osd)
2233d14c5d2SYehuda Sadeh {
2243d14c5d2SYehuda Sadeh if (osd >= map->max_osd)
2253d14c5d2SYehuda Sadeh return NULL;
2263d14c5d2SYehuda Sadeh return &map->osd_addr[osd];
2273d14c5d2SYehuda Sadeh }
2283d14c5d2SYehuda Sadeh
2298cb441c0SIlya Dryomov #define CEPH_PGID_ENCODING_LEN (1 + 8 + 4 + 4)
2308cb441c0SIlya Dryomov
ceph_decode_pgid(void ** p,void * end,struct ceph_pg * pgid)231ef4859d6SAlex Elder static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
232ef4859d6SAlex Elder {
233ef4859d6SAlex Elder __u8 version;
234ef4859d6SAlex Elder
2358cb441c0SIlya Dryomov if (!ceph_has_room(p, end, CEPH_PGID_ENCODING_LEN)) {
2363ef650d3SJoe Perches pr_warn("incomplete pg encoding\n");
237ef4859d6SAlex Elder return -EINVAL;
238ef4859d6SAlex Elder }
239ef4859d6SAlex Elder version = ceph_decode_8(p);
240ef4859d6SAlex Elder if (version > 1) {
2413ef650d3SJoe Perches pr_warn("do not understand pg encoding %d > 1\n",
242ef4859d6SAlex Elder (int)version);
243ef4859d6SAlex Elder return -EINVAL;
244ef4859d6SAlex Elder }
245ef4859d6SAlex Elder
246ef4859d6SAlex Elder pgid->pool = ceph_decode_64(p);
247ef4859d6SAlex Elder pgid->seed = ceph_decode_32(p);
248ef4859d6SAlex Elder *p += 4; /* skip deprecated preferred value */
249ef4859d6SAlex Elder
250ef4859d6SAlex Elder return 0;
251ef4859d6SAlex Elder }
252ef4859d6SAlex Elder
253e5253a7bSIlya Dryomov struct ceph_osdmap *ceph_osdmap_alloc(void);
254*a5cbd5fcSIlya Dryomov struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2);
255*a5cbd5fcSIlya Dryomov struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2,
2560c0a8de1SIlya Dryomov struct ceph_osdmap *map);
2573d14c5d2SYehuda Sadeh extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
2583d14c5d2SYehuda Sadeh
2596f3bfd45SIlya Dryomov struct ceph_osds {
2606f3bfd45SIlya Dryomov int osds[CEPH_PG_MAX_SIZE];
2616f3bfd45SIlya Dryomov int size;
2626f3bfd45SIlya Dryomov int primary; /* id, NOT index */
2636f3bfd45SIlya Dryomov };
2646f3bfd45SIlya Dryomov
ceph_osds_init(struct ceph_osds * set)2656f3bfd45SIlya Dryomov static inline void ceph_osds_init(struct ceph_osds *set)
2666f3bfd45SIlya Dryomov {
2676f3bfd45SIlya Dryomov set->size = 0;
2686f3bfd45SIlya Dryomov set->primary = -1;
2696f3bfd45SIlya Dryomov }
2706f3bfd45SIlya Dryomov
2716f3bfd45SIlya Dryomov void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
2726f3bfd45SIlya Dryomov
2737de030d6SIlya Dryomov bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
2747de030d6SIlya Dryomov u32 new_pg_num);
27563244fa1SIlya Dryomov bool ceph_is_new_interval(const struct ceph_osds *old_acting,
27663244fa1SIlya Dryomov const struct ceph_osds *new_acting,
27763244fa1SIlya Dryomov const struct ceph_osds *old_up,
27863244fa1SIlya Dryomov const struct ceph_osds *new_up,
27963244fa1SIlya Dryomov int old_size,
28063244fa1SIlya Dryomov int new_size,
28163244fa1SIlya Dryomov int old_min_size,
28263244fa1SIlya Dryomov int new_min_size,
28363244fa1SIlya Dryomov u32 old_pg_num,
28463244fa1SIlya Dryomov u32 new_pg_num,
28563244fa1SIlya Dryomov bool old_sort_bitwise,
28663244fa1SIlya Dryomov bool new_sort_bitwise,
287ae78dd81SIlya Dryomov bool old_recovery_deletes,
288ae78dd81SIlya Dryomov bool new_recovery_deletes,
28963244fa1SIlya Dryomov const struct ceph_pg *pgid);
29063244fa1SIlya Dryomov bool ceph_osds_changed(const struct ceph_osds *old_acting,
29163244fa1SIlya Dryomov const struct ceph_osds *new_acting,
29263244fa1SIlya Dryomov bool any_change);
29363244fa1SIlya Dryomov
294a86f009fSIlya Dryomov void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
295df28152dSIlya Dryomov const struct ceph_object_id *oid,
296df28152dSIlya Dryomov const struct ceph_object_locator *oloc,
297df28152dSIlya Dryomov struct ceph_pg *raw_pgid);
298d9591f5eSIlya Dryomov int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
299df28152dSIlya Dryomov const struct ceph_object_id *oid,
300df28152dSIlya Dryomov const struct ceph_object_locator *oloc,
301d9591f5eSIlya Dryomov struct ceph_pg *raw_pgid);
3027c13cb64SIlya Dryomov
3036f3bfd45SIlya Dryomov void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
304df28152dSIlya Dryomov struct ceph_pg_pool_info *pi,
3056f3bfd45SIlya Dryomov const struct ceph_pg *raw_pgid,
3066f3bfd45SIlya Dryomov struct ceph_osds *up,
3076f3bfd45SIlya Dryomov struct ceph_osds *acting);
308dc98ff72SIlya Dryomov bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
309df28152dSIlya Dryomov struct ceph_pg_pool_info *pi,
310dc98ff72SIlya Dryomov const struct ceph_pg *raw_pgid,
311dc98ff72SIlya Dryomov struct ceph_spg *spgid);
312f81f1633SIlya Dryomov int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
313f81f1633SIlya Dryomov const struct ceph_pg *raw_pgid);
3143d14c5d2SYehuda Sadeh
/*
 * One CRUSH location entry: a type name and a name.
 */
struct crush_loc {
	char	*cl_type_name;
	char	*cl_name;
};
31945e6aa9fSIlya Dryomov
32045e6aa9fSIlya Dryomov struct crush_loc_node {
32145e6aa9fSIlya Dryomov struct rb_node cl_node;
32245e6aa9fSIlya Dryomov struct crush_loc cl_loc; /* pointers into cl_data */
32345e6aa9fSIlya Dryomov char cl_data[];
32445e6aa9fSIlya Dryomov };
32545e6aa9fSIlya Dryomov
32645e6aa9fSIlya Dryomov int ceph_parse_crush_location(char *crush_location, struct rb_root *locs);
32745e6aa9fSIlya Dryomov int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2);
32845e6aa9fSIlya Dryomov void ceph_clear_crush_locs(struct rb_root *locs);
32945e6aa9fSIlya Dryomov
330117d96a0SIlya Dryomov int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id,
331117d96a0SIlya Dryomov struct rb_root *locs);
332117d96a0SIlya Dryomov
333ce7f6a27SIlya Dryomov extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
334ce7f6a27SIlya Dryomov u64 id);
33572afc71fSAlex Elder extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
3363d14c5d2SYehuda Sadeh extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
33776142097SIlya Dryomov u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id);
3383d14c5d2SYehuda Sadeh
3393d14c5d2SYehuda Sadeh #endif
340